Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-25 05:48:47 +01:00)
convert-llama-ggml-to-gguf: Try to handle files older than GGJTv3 (#3023)

* convert-llama-ggmlv3-to-gguf: Try to handle files older than GGJTv3
* Better error messages for files that cannot be converted
* Add file type to GGUF output
* Rename to convert-llama-ggml-to-gguf.py
* Include original file type information in description
* Improve some informational output
This commit is contained in:
parent 9912b9efc8
commit ea2c85d5d2
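The diff below reworks the converter's header validation to recognize all three legacy container formats rather than only GGJTv3. As a quick orientation (not part of the commit), this hypothetical probe_ggml_format helper mirrors the magic/version rules the new validate_header() applies:

import struct
import sys

def probe_ggml_format(path):
    # Read just the magic and (if present) version word.
    with open(path, 'rb') as fp:
        header = fp.read(8)
    magic = header[:4]
    if magic == b'GGUF':
        return ('GGUF', None)            # already converted, nothing to do
    if magic == b'lmgg':
        return ('GGML', 1)               # unversioned bare GGML, no vocab scores
    version = struct.unpack('<I', header[4:8])[0]
    if magic == b'fmgg':
        return ('GGMF', version)         # only v1 is handled by the converter
    if magic == b'tjgg':
        return ('GGJT', version)         # v1-v3; quantization layouts differ by version
    raise ValueError(f'Unexpected file magic {magic!r}')

if __name__ == '__main__':
    print(probe_ggml_format(sys.argv[1]))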
@@ -5,6 +5,7 @@ import argparse
 import math
 import struct
 import sys
+from enum import IntEnum
 from pathlib import Path
 
 import numpy as np
@@ -34,10 +35,35 @@ GGML_QUANT_SIZES = {
     gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
 }
 
+class GGMLFormat(IntEnum):
+    GGML = 0
+    GGMF = 1
+    GGJT = 2
+
+class GGMLFType(IntEnum):
+    ALL_F32              = 0
+    MOSTLY_F16           = 1
+    MOSTLY_Q4_0          = 2
+    MOSTLY_Q4_1          = 3
+    MOSTLY_Q4_1_SOME_F16 = 4
+    MOSTLY_Q8_0          = 7
+    MOSTLY_Q5_0          = 8
+    MOSTLY_Q5_1          = 9
+    MOSTLY_Q2_K          = 10
+    MOSTLY_Q3_K_S        = 11
+    MOSTLY_Q3_K_M        = 12
+    MOSTLY_Q3_K_L        = 13
+    MOSTLY_Q4_K_S        = 14
+    MOSTLY_Q4_K_M        = 15
+    MOSTLY_Q5_K_S        = 16
+    MOSTLY_Q5_K_M        = 17
+    MOSTLY_Q6_K          = 18
+
 class Hyperparameters:
     def __init__(self):
-        self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
-        self.n_ff = 0
+        self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
+        self.n_layer = self.n_rot = self.n_ff = 0
+        self.ftype = GGMLFType.ALL_F32
 
     def set_n_ff(self, model):
         ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
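A note on the new GGMLFType enum: values 5 and 6 are deliberately absent (they belonged to the long-removed Q4_2/Q4_3 types), so constructing the enum doubles as validation. The Hyperparameters.load() change in the next hunk relies on exactly that ValueError. A small illustration, abbreviated from the enum above:

from enum import IntEnum

class GGMLFType(IntEnum):    # abbreviated from the full enum in the diff
    ALL_F32     = 0
    MOSTLY_F16  = 1
    MOSTLY_Q4_0 = 2

print(GGMLFType(1).name)     # -> MOSTLY_F16
try:
    GGMLFType(5)             # 5/6 were never valid ftypes here
except ValueError as e:
    print(f'Invalid ftype: {e}')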
@@ -53,16 +79,21 @@ class Hyperparameters:
             self.n_head,
             self.n_layer,
             self.n_rot,
-            self.ftype,
+            ftype,
         ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
+        try:
+            self.ftype = GGMLFType(ftype)
+        except ValueError:
+            raise ValueError(f'Invalid ftype {ftype}')
         return 4 * 7
 
     def __str__(self):
-        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
+        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
 
 class Vocab:
-    def __init__(self):
+    def __init__(self, load_scores = True):
         self.items = []
+        self.load_scores = load_scores
 
     def load(self, data, offset, n_vocab):
         orig_offset = offset
@@ -70,20 +101,24 @@ class Vocab:
             itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
             assert itemlen < 4096, 'Absurd vocab item length'
             offset += 4
-            vocab = bytes(data[offset:offset + itemlen])
+            item_text = bytes(data[offset:offset + itemlen])
             offset += itemlen
-            score = struct.unpack('<f', data[offset:offset + 4])[0]
-            offset += 4
-            self.items.append((vocab, score))
+            if self.load_scores:
+                item_score = struct.unpack('<f', data[offset:offset + 4])[0]
+                offset += 4
+            else:
+                item_score = 0.0
+            self.items.append((item_text, item_score))
         return offset - orig_offset
 
 class Tensor:
-    def __init__(self):
+    def __init__(self, use_padding = True):
         self.name = None
         self.dims: tuple[int, ...] = ()
         self.dtype = None
         self.start_offset = 0
         self.len_bytes = np.int64(0)
+        self.use_padding = use_padding
 
     def load(self, data, offset):
         orig_offset = offset
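The load_scores and use_padding flags introduced above are driven later, in GGMLModel.load(), by ordered comparisons on the GGMLFormat IntEnum. A minimal sketch of that dispatch, assuming the enum values from this diff:

from enum import IntEnum

class GGMLFormat(IntEnum):   # as defined earlier in the diff
    GGML = 0
    GGMF = 1
    GGJT = 2

for fmt in GGMLFormat:
    # Mirrors Vocab(load_scores = ...) and Tensor(use_padding = ...) in load():
    load_scores = fmt > GGMLFormat.GGML   # bare GGML stores no vocab scores
    use_padding = fmt > GGMLFormat.GGMF   # only GGJT 32-byte-aligns tensor data
    print(f'{fmt.name}: load_scores={load_scores}, use_padding={use_padding}')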
@@ -99,7 +134,7 @@ class Tensor:
         offset += 4 * n_dims
         self.name = bytes(data[offset:offset + name_len])
         offset += name_len
-        pad = ((offset + 31) & ~31) - offset
+        pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
         offset += pad
         n_elems = np.prod(self.dims)
         n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
@@ -109,7 +144,7 @@ class Tensor:
         # print(n_dims, name_len, dtype, self.dims, self.name, pad)
         return offset - orig_offset
 
-class GGMLV3Model:
+class GGMLModel:
     def __init__(self):
         self.hyperparameters = None
         self.vocab = None
@@ -117,20 +152,52 @@ class GGMLV3Model:
         self.tensors = []
 
     def validate_header(self, data, offset):
-        if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
-            raise ValueError('Only GGJTv3 supported')
-        return 8
+        magic = bytes(data[offset:offset + 4])
+        if magic == b'GGUF':
+            raise ValueError('File is already in GGUF format.')
+        if magic == b'lmgg':
+            self.file_format = GGMLFormat.GGML
+            self.format_version = 1
+            return 4
+        version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
+        if magic == b'fmgg':
+            if version != 1:
+                raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
+            self.file_format = GGMLFormat.GGMF
+            self.format_version = version
+            return 8
+        if magic == b'tjgg':
+            if version < 1 or version > 3:
+                raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
+            self.file_format = GGMLFormat.GGJT
+            self.format_version = version
+            return 8
+        raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
+
+    def validate_conversion(self, ftype):
+        err = ''
+        if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
+            if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
+                err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
+        elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
+            if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
+                          GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
+                err = 'Q4 and Q8 quantizations changed in GGJTv3.'
+        if len(err) > 0:
+            raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
 
     def load(self, data, offset):
         offset += self.validate_header(data, offset)
         hp = Hyperparameters()
         offset += hp.load(data, offset)
-        vocab = Vocab()
+        print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
+        self.validate_conversion(hp.ftype)
+        vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
         offset += vocab.load(data, offset, hp.n_vocab)
         tensors: list[Tensor] = []
         tensor_map = {}
         while offset < len(data):
-            tensor = Tensor()
+            tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
             offset += tensor.load(data, offset)
             tensor_map[tensor.name] = len(tensors)
             tensors.append(tensor)
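validate_conversion() above encodes which (format, version, ftype) combinations survive the on-disk quantization changes. A standalone recap using a hypothetical convertible() helper (plain ints stand in for the enums; not part of the commit):

def convertible(file_format, format_version, ftype):
    # Pre-GGJTv2 files: only unquantized ftypes survive the quant format changes.
    if file_format < 2 or format_version < 2:   # GGMLFormat.GGJT == 2
        return ftype in (0, 1)                  # ALL_F32, MOSTLY_F16
    # GGJTv2: Q4 and Q8 layouts changed again in GGJTv3.
    if file_format == 2 and format_version == 2:
        return ftype not in (2, 3, 4, 7)        # Q4_0, Q4_1, Q4_1_SOME_F16, Q8_0
    return True

assert convertible(2, 3, 10)       # GGJTv3 Q2_K: fine
assert not convertible(2, 2, 2)    # GGJTv2 Q4_0: rejected
assert not convertible(0, 1, 2)    # bare GGML Q4_0: rejected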
@@ -168,7 +235,10 @@ class GGMLToGGUF:
 
     def save(self):
         print('* Preparing to save GGUF file')
-        gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
+        gguf_writer = gguf.GGUFWriter(
+            self.cfg.output,
+            gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
+            use_temp_file = False )
         self.add_params(gguf_writer)
         self.add_vocab(gguf_writer)
         if self.special_vocab is not None:
@@ -185,7 +255,10 @@ class GGMLToGGUF:
     def add_params(self, gguf_writer):
        hp = self.model.hyperparameters
        cfg = self.cfg
-        desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
+        if cfg.desc is not None:
+            desc = cfg.desc
+        else:
+            desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
        try:
            # Filenames aren't necessarily valid UTF8.
            name = cfg.name if cfg.name is not None else cfg.input.name
@@ -195,6 +268,7 @@ class GGMLToGGUF:
         if name is not None:
             gguf_writer.add_name(name)
         gguf_writer.add_description(desc)
+        gguf_writer.add_file_type(int(hp.ftype))
         if self.params_override is not None:
             po = self.params_override
             assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
@@ -231,7 +305,8 @@ class GGMLToGGUF:
                 tokens.append(vbytes)
                 scores.append(score)
                 toktypes.append(ttype)
-            assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
+            assert len(tokens) == hp.n_vocab, \
+                f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
             gguf_writer.add_token_list(tokens)
             gguf_writer.add_token_scores(scores)
             if len(toktypes) > 0:
@@ -283,7 +358,11 @@ class GGMLToGGUF:
                 tempdims[1] = tempdims[0]
                 tempdims[0] = temp
             # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
-            gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
+            gguf_writer.add_tensor(
+                mapped_name,
+                data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
+                raw_shape = tempdims,
+                raw_dtype = tensor.dtype )
 
 def handle_metadata(cfg, hp):
     import convert
@@ -305,32 +384,46 @@ def handle_metadata(cfg, hp):
         params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
     else:
         raise ValueError('Unable to load metadata')
-    vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
+    vocab = convert.load_vocab(
+        cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
+        cfg.vocabtype )
     # FIXME: Respect cfg.vocab_dir?
     svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
     convert.check_vocab_size(params, vocab)
     return (params, vocab, svocab)
 
 def handle_args():
-    parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
-    parser.add_argument('--input', '-i', type = Path, required = True, help = 'Input GGMLv3 filename')
-    parser.add_argument('--output', '-o', type = Path, required = True, help ='Output GGUF filename')
-    parser.add_argument('--name', help = 'Set model name')
-    parser.add_argument('--desc', help = 'Set model description')
-    parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
-    parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
-    parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
-    parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
-    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
-    parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
+    parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
+    parser.add_argument('--input', '-i', type = Path, required = True,
+        help = 'Input GGMLv3 filename')
+    parser.add_argument('--output', '-o', type = Path, required = True,
+        help ='Output GGUF filename')
+    parser.add_argument('--name',
+        help = 'Set model name')
+    parser.add_argument('--desc',
+        help = 'Set model description')
+    parser.add_argument('--gqa', type = int, default = 1,
+        help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
+    parser.add_argument('--eps', default = '5.0e-06',
+        help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
+    parser.add_argument('--context-length', '-c', type=int, default = 2048,
+        help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
+    parser.add_argument('--model-metadata-dir', '-m', type = Path,
+        help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
+    parser.add_argument("--vocab-dir", type=Path,
+        help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
+    parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
+        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
     return parser.parse_args()
 
 def main():
     cfg = handle_args()
     print(f'* Using config: {cfg}')
     print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
+    if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
+        print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
     data = np.memmap(cfg.input, mode = 'r')
-    model = GGMLV3Model()
+    model = GGMLModel()
     print('* Scanning GGML input file')
     offset = model.load(data, 0)
     print(f'* GGML model hyperparameters: {model.hyperparameters}')
@@ -345,7 +438,12 @@ def main():
         print(f'* Special vocab: {special_vocab}')
     else:
         print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
-    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab)
+    if model.file_format == GGMLFormat.GGML:
+        print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
+    converter = GGMLToGGUF(model, data, cfg,
+        params_override = params_override,
+        vocab_override = vocab_override,
+        special_vocab = special_vocab )
     converter.save()
     print(f'* Successful completion. Output saved to: {cfg.output}')