mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-23 18:09:18 +01:00
convert : refactor vocab selection logic (#6355)
This commit is contained in:
parent
66ba560256
commit
be55134a53
@ -23,7 +23,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
|||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||||
import gguf
|
import gguf
|
||||||
|
|
||||||
from convert import HfVocab
|
from convert import LlamaHfVocab
|
||||||
|
|
||||||
|
|
||||||
###### MODEL DEFINITIONS ######
|
###### MODEL DEFINITIONS ######
|
||||||
@ -230,7 +230,7 @@ class Model(ABC):
|
|||||||
def _set_vocab_gpt2(self):
|
def _set_vocab_gpt2(self):
|
||||||
dir_model = self.dir_model
|
dir_model = self.dir_model
|
||||||
hparams = self.hparams
|
hparams = self.hparams
|
||||||
tokens: list[bytearray] = []
|
tokens: list[str] = []
|
||||||
toktypes: list[int] = []
|
toktypes: list[int] = []
|
||||||
|
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
@ -243,8 +243,7 @@ class Model(ABC):
|
|||||||
|
|
||||||
for i in range(vocab_size):
|
for i in range(vocab_size):
|
||||||
if i not in reverse_vocab:
|
if i not in reverse_vocab:
|
||||||
pad_token = f"[PAD{i}]".encode('utf-8')
|
tokens.append(f"[PAD{i}]")
|
||||||
tokens.append(bytearray(pad_token))
|
|
||||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||||
elif reverse_vocab[i] in added_vocab:
|
elif reverse_vocab[i] in added_vocab:
|
||||||
tokens.append(reverse_vocab[i])
|
tokens.append(reverse_vocab[i])
|
||||||
@ -266,7 +265,7 @@ class Model(ABC):
|
|||||||
def _set_vocab_qwen(self):
|
def _set_vocab_qwen(self):
|
||||||
dir_model = self.dir_model
|
dir_model = self.dir_model
|
||||||
hparams = self.hparams
|
hparams = self.hparams
|
||||||
tokens: list[bytearray] = []
|
tokens: list[str] = []
|
||||||
toktypes: list[int] = []
|
toktypes: list[int] = []
|
||||||
|
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
@ -291,8 +290,7 @@ class Model(ABC):
|
|||||||
|
|
||||||
for i in range(vocab_size):
|
for i in range(vocab_size):
|
||||||
if i not in reverse_vocab:
|
if i not in reverse_vocab:
|
||||||
pad_token = f"[PAD{i}]".encode("utf-8")
|
tokens.append(f"[PAD{i}]")
|
||||||
tokens.append(bytearray(pad_token))
|
|
||||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||||
elif reverse_vocab[i] in added_vocab:
|
elif reverse_vocab[i] in added_vocab:
|
||||||
tokens.append(reverse_vocab[i])
|
tokens.append(reverse_vocab[i])
|
||||||
@ -372,12 +370,8 @@ class Model(ABC):
|
|||||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||||
special_vocab.add_to_gguf(self.gguf_writer)
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
def _set_vocab_hf(self):
|
def _set_vocab_llama_hf(self):
|
||||||
path = self.dir_model
|
vocab = LlamaHfVocab(self.dir_model)
|
||||||
added_tokens_path = self.dir_model
|
|
||||||
vocab = HfVocab(
|
|
||||||
path, added_tokens_path if added_tokens_path.exists() else None
|
|
||||||
)
|
|
||||||
tokens = []
|
tokens = []
|
||||||
scores = []
|
scores = []
|
||||||
toktypes = []
|
toktypes = []
|
||||||
@ -1099,7 +1093,7 @@ class MiniCPMModel(Model):
|
|||||||
self.gguf_writer.add_file_type(self.ftype)
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
self._set_vocab_hf()
|
self._set_vocab_llama_hf()
|
||||||
|
|
||||||
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
|
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
|
||||||
if n_kv_head is not None and n_head != n_kv_head:
|
if n_kv_head is not None and n_head != n_kv_head:
|
||||||
@ -1700,11 +1694,8 @@ class BertModel(Model):
|
|||||||
self.gguf_writer.add_pooling_type(pooling_type)
|
self.gguf_writer.add_pooling_type(pooling_type)
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
path = self.dir_model
|
|
||||||
added_tokens_path = self.dir_model if self.dir_model.exists() else None
|
|
||||||
|
|
||||||
# use huggingface vocab to get all tokens
|
# use huggingface vocab to get all tokens
|
||||||
vocab = HfVocab(path, added_tokens_path)
|
vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
|
||||||
tokens, scores, toktypes = zip(*vocab.all_tokens())
|
tokens, scores, toktypes = zip(*vocab.all_tokens())
|
||||||
assert len(tokens) == vocab.vocab_size
|
assert len(tokens) == vocab.vocab_size
|
||||||
self.vocab_size = vocab.vocab_size
|
self.vocab_size = vocab.vocab_size
|
||||||
|
@ -106,12 +106,12 @@ def main():
|
|||||||
tensor_map = gguf.get_tensor_name_map(arch, block_count)
|
tensor_map = gguf.get_tensor_name_map(arch, block_count)
|
||||||
print(tensor_map)
|
print(tensor_map)
|
||||||
for name in tensors.keys():
|
for name in tensors.keys():
|
||||||
data = tensors[name]
|
data_torch = tensors[name]
|
||||||
if name.endswith(".self_attention.rotary_emb.inv_freq"):
|
if name.endswith(".self_attention.rotary_emb.inv_freq"):
|
||||||
continue
|
continue
|
||||||
old_dtype = data.dtype
|
old_dtype = data_torch.dtype
|
||||||
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
|
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
|
||||||
data = data.to(torch.float32).squeeze().numpy()
|
data = data_torch.to(torch.float32).squeeze().numpy()
|
||||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||||
if new_name is None:
|
if new_name is None:
|
||||||
print("Can not map tensor '" + name + "'")
|
print("Can not map tensor '" + name + "'")
|
||||||
|
341
convert.py
341
convert.py
@ -16,13 +16,14 @@ import re
|
|||||||
import signal
|
import signal
|
||||||
import struct
|
import struct
|
||||||
import sys
|
import sys
|
||||||
|
import textwrap
|
||||||
import time
|
import time
|
||||||
import zipfile
|
import zipfile
|
||||||
from abc import ABCMeta, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
|
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sentencepiece import SentencePieceProcessor
|
from sentencepiece import SentencePieceProcessor
|
||||||
@ -43,6 +44,9 @@ ARCH = gguf.MODEL_ARCH.LLAMA
|
|||||||
|
|
||||||
DEFAULT_CONCURRENCY = 8
|
DEFAULT_CONCURRENCY = 8
|
||||||
|
|
||||||
|
ADDED_TOKENS_FILE = 'added_tokens.json'
|
||||||
|
FAST_TOKENIZER_FILE = 'tokenizer.json'
|
||||||
|
|
||||||
#
|
#
|
||||||
# data types
|
# data types
|
||||||
#
|
#
|
||||||
@ -188,8 +192,10 @@ class Params:
|
|||||||
n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
|
n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
|
||||||
|
|
||||||
if n_layer < 1:
|
if n_layer < 1:
|
||||||
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
|
msg = """\
|
||||||
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
failed to guess 'n_layer'. This model is unknown or unsupported.
|
||||||
|
Suggestion: provide 'config.json' of the model in the same directory containing model files."""
|
||||||
|
raise KeyError(textwrap.dedent(msg))
|
||||||
|
|
||||||
n_head = n_embd // 128 # guessed
|
n_head = n_embd // 128 # guessed
|
||||||
n_mult = 256 # guessed
|
n_mult = 256 # guessed
|
||||||
@ -211,7 +217,8 @@ class Params:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
|
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
|
||||||
config = json.load(open(config_path))
|
with open(config_path) as f:
|
||||||
|
config = json.load(f)
|
||||||
|
|
||||||
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
|
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
|
||||||
rope_scaling = config.get("rope_scaling")
|
rope_scaling = config.get("rope_scaling")
|
||||||
@ -233,8 +240,10 @@ class Params:
|
|||||||
elif "max_position_embeddings" in config:
|
elif "max_position_embeddings" in config:
|
||||||
n_ctx = config["max_position_embeddings"]
|
n_ctx = config["max_position_embeddings"]
|
||||||
else:
|
else:
|
||||||
raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
|
msg = """\
|
||||||
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
failed to guess 'n_ctx'. This model is unknown or unsupported.
|
||||||
|
Suggestion: provide 'config.json' of the model in the same directory containing model files."""
|
||||||
|
raise KeyError(textwrap.dedent(msg))
|
||||||
|
|
||||||
n_experts = None
|
n_experts = None
|
||||||
n_experts_used = None
|
n_experts_used = None
|
||||||
@ -265,7 +274,8 @@ class Params:
|
|||||||
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
|
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
|
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
|
||||||
config = json.load(open(config_path))
|
with open(config_path) as f:
|
||||||
|
config = json.load(f)
|
||||||
|
|
||||||
n_experts = None
|
n_experts = None
|
||||||
n_experts_used = None
|
n_experts_used = None
|
||||||
@ -331,47 +341,86 @@ class Params:
|
|||||||
# vocab
|
# vocab
|
||||||
#
|
#
|
||||||
|
|
||||||
class BpeVocab:
|
@runtime_checkable
|
||||||
|
class BaseVocab(Protocol):
|
||||||
|
tokenizer_model: ClassVar[str]
|
||||||
|
name: ClassVar[str]
|
||||||
|
|
||||||
|
|
||||||
|
class NoVocab(BaseVocab):
|
||||||
|
tokenizer_model = "no_vocab"
|
||||||
|
name = "no_vocab"
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
return "<NoVocab for a model without integrated vocabulary>"
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
|
||||||
|
class Vocab(BaseVocab, Protocol):
|
||||||
|
vocab_size: int
|
||||||
|
added_tokens_dict: dict[str, int]
|
||||||
|
added_tokens_list: list[str]
|
||||||
|
fname_tokenizer: Path
|
||||||
|
|
||||||
|
def __init__(self, base_path: Path): ...
|
||||||
|
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
|
||||||
|
|
||||||
|
|
||||||
|
class BpeVocab(Vocab):
|
||||||
tokenizer_model = "gpt2"
|
tokenizer_model = "gpt2"
|
||||||
name = "bpe"
|
name = "bpe"
|
||||||
|
|
||||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
def __init__(self, base_path: Path):
|
||||||
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
|
added_tokens: dict[str, int] = {}
|
||||||
if isinstance(self.bpe_tokenizer.get('model'), dict):
|
|
||||||
self.vocab = self.bpe_tokenizer["model"]["vocab"]
|
|
||||||
else:
|
|
||||||
self.vocab = self.bpe_tokenizer
|
|
||||||
added_tokens: dict[str, int]
|
|
||||||
if fname_added_tokens is not None:
|
|
||||||
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
|
||||||
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
|
||||||
else:
|
|
||||||
# Fall back to trying to find the added tokens in tokenizer.json
|
|
||||||
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
|
|
||||||
if not tokenizer_json_file.is_file():
|
|
||||||
added_tokens = {}
|
|
||||||
else:
|
|
||||||
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
|
|
||||||
added_tokens = dict(
|
|
||||||
(item['content'], item['id'])
|
|
||||||
for item in tokenizer_json.get('added_tokens', [])
|
|
||||||
# Added tokens here can be duplicates of the main vocabulary.
|
|
||||||
if item['content'] not in self.bpe_tokenizer)
|
|
||||||
|
|
||||||
vocab_size: int = len(self.vocab)
|
if (fname_tokenizer := base_path / 'vocab.json').exists():
|
||||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
# "slow" tokenizer
|
||||||
actual_ids = sorted(added_tokens.values())
|
with open(fname_tokenizer, encoding="utf-8") as f:
|
||||||
|
self.vocab = json.load(f)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
||||||
|
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
|
||||||
|
added_tokens = json.load(f)
|
||||||
|
except FileNotFoundError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
# "fast" tokenizer
|
||||||
|
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
|
||||||
|
|
||||||
|
# if this fails, FileNotFoundError propagates to caller
|
||||||
|
with open(fname_tokenizer, encoding="utf-8") as f:
|
||||||
|
tokenizer_json = json.load(f)
|
||||||
|
|
||||||
|
tokenizer_model: dict[str, Any] = tokenizer_json['model']
|
||||||
|
if (
|
||||||
|
tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
|
||||||
|
or tokenizer_json['decoder']['type'] != 'ByteLevel'
|
||||||
|
):
|
||||||
|
raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
|
||||||
|
|
||||||
|
self.vocab = tokenizer_model["vocab"]
|
||||||
|
|
||||||
|
if (added := tokenizer_json.get('added_tokens')) is not None:
|
||||||
|
# Added tokens here can be duplicates of the main vocabulary.
|
||||||
|
added_tokens = {item['content']: item['id']
|
||||||
|
for item in added
|
||||||
|
if item['content'] not in self.vocab}
|
||||||
|
|
||||||
|
vocab_size = len(self.vocab)
|
||||||
|
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
||||||
|
actual_ids = sorted(added_tokens.values())
|
||||||
if expected_ids != actual_ids:
|
if expected_ids != actual_ids:
|
||||||
expected_end_id = vocab_size + len(actual_ids) - 1
|
expected_end_id = vocab_size + len(actual_ids) - 1
|
||||||
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
|
raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
|
||||||
|
f"{vocab_size} - {expected_end_id}; got {actual_ids}")
|
||||||
|
|
||||||
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
||||||
self.added_tokens_dict = added_tokens
|
self.added_tokens_dict = added_tokens
|
||||||
self.added_tokens_list = [text for (text, idx) in items]
|
self.added_tokens_list = [text for (text, idx) in items]
|
||||||
self.vocab_size_base: int = vocab_size
|
self.vocab_size_base = vocab_size
|
||||||
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
|
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||||
self.fname_tokenizer = fname_tokenizer
|
self.fname_tokenizer = fname_tokenizer
|
||||||
self.fname_added_tokens = fname_added_tokens
|
|
||||||
|
|
||||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
|
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
|
||||||
@ -392,19 +441,25 @@ class BpeVocab:
|
|||||||
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||||
|
|
||||||
|
|
||||||
class SentencePieceVocab:
|
class SentencePieceVocab(Vocab):
|
||||||
tokenizer_model = "llama"
|
tokenizer_model = "llama"
|
||||||
name = "spm"
|
name = "spm"
|
||||||
|
|
||||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
def __init__(self, base_path: Path):
|
||||||
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
|
added_tokens: dict[str, int] = {}
|
||||||
added_tokens: dict[str, int]
|
if (fname_tokenizer := base_path / 'tokenizer.model').exists():
|
||||||
if fname_added_tokens is not None:
|
# normal location
|
||||||
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
try:
|
||||||
else:
|
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
|
||||||
added_tokens = {}
|
added_tokens = json.load(f)
|
||||||
|
except FileNotFoundError:
|
||||||
|
pass
|
||||||
|
elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
|
||||||
|
# not found in alternate location either
|
||||||
|
raise FileNotFoundError('Cannot find tokenizer.model')
|
||||||
|
|
||||||
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
|
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
|
||||||
|
vocab_size = self.sentencepiece_tokenizer.vocab_size()
|
||||||
|
|
||||||
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
|
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
|
||||||
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
|
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
|
||||||
@ -414,18 +469,17 @@ class SentencePieceVocab:
|
|||||||
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
|
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
|
||||||
|
|
||||||
# Token pieces that were added to the base vocabulary.
|
# Token pieces that were added to the base vocabulary.
|
||||||
self.added_tokens_dict = added_tokens
|
self.added_tokens_dict = added_tokens
|
||||||
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
||||||
self.vocab_size_base = vocab_size
|
self.vocab_size_base = vocab_size
|
||||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||||
self.fname_tokenizer = fname_tokenizer
|
self.fname_tokenizer = fname_tokenizer
|
||||||
self.fname_added_tokens = fname_added_tokens
|
|
||||||
|
|
||||||
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
tokenizer = self.sentencepiece_tokenizer
|
tokenizer = self.sentencepiece_tokenizer
|
||||||
for i in range(tokenizer.vocab_size()):
|
for i in range(tokenizer.vocab_size()):
|
||||||
piece = tokenizer.id_to_piece(i)
|
piece = tokenizer.id_to_piece(i)
|
||||||
text: bytes = piece.encode("utf-8")
|
text = piece.encode("utf-8")
|
||||||
score: float = tokenizer.get_score(i)
|
score: float = tokenizer.get_score(i)
|
||||||
|
|
||||||
toktype = gguf.TokenType.NORMAL
|
toktype = gguf.TokenType.NORMAL
|
||||||
@ -458,27 +512,42 @@ class SentencePieceVocab:
|
|||||||
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||||
|
|
||||||
|
|
||||||
class HfVocab:
|
class LlamaHfVocab(Vocab):
|
||||||
tokenizer_model = "llama"
|
tokenizer_model = "llama"
|
||||||
name = "hfft"
|
name = "hfft"
|
||||||
|
|
||||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
|
def __init__(self, base_path: Path, ignore_nonllama: bool = False):
|
||||||
|
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
|
||||||
|
# if this fails, FileNotFoundError propagates to caller
|
||||||
|
with open(fname_tokenizer, encoding='utf-8') as f:
|
||||||
|
tokenizer_json = json.load(f)
|
||||||
|
|
||||||
|
# pre-check so we know if we need transformers
|
||||||
|
tokenizer_model: dict[str, Any] = tokenizer_json['model']
|
||||||
|
if ignore_nonllama:
|
||||||
|
pass # workaround incorrect use of this class for WordPiece
|
||||||
|
elif (
|
||||||
|
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
|
||||||
|
or tokenizer_json['decoder']['type'] != 'Sequence'
|
||||||
|
):
|
||||||
|
raise FileNotFoundError('Cannot find Llama BPE tokenizer')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"To use HfVocab, please install the `transformers` package. "
|
"To use LlamaHfVocab, please install the `transformers` package. "
|
||||||
"You can install it with `pip install transformers`."
|
"You can install it with `pip install transformers`."
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
print("fname_tokenizer:", fname_tokenizer)
|
|
||||||
# Allow the tokenizer to default to slow or fast versions.
|
# Allow the tokenizer to default to slow or fast versions.
|
||||||
# Explicitly set tokenizer to use local paths.
|
# Explicitly set tokenizer to use local paths.
|
||||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
self.tokenizer = AutoTokenizer.from_pretrained(
|
||||||
fname_tokenizer,
|
base_path,
|
||||||
cache_dir=fname_tokenizer,
|
cache_dir=base_path,
|
||||||
local_files_only=True,
|
local_files_only=True,
|
||||||
)
|
)
|
||||||
|
assert self.tokenizer.is_fast # assume tokenizer.json is used
|
||||||
|
|
||||||
# Initialize lists and dictionaries for added tokens
|
# Initialize lists and dictionaries for added tokens
|
||||||
self.added_tokens_list = []
|
self.added_tokens_list = []
|
||||||
@ -506,8 +575,7 @@ class HfVocab:
|
|||||||
self.vocab_size_base = self.tokenizer.vocab_size
|
self.vocab_size_base = self.tokenizer.vocab_size
|
||||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||||
|
|
||||||
self.fname_tokenizer = fname_tokenizer
|
self.fname_tokenizer = fname_tokenizer
|
||||||
self.fname_added_tokens = fname_added_tokens
|
|
||||||
|
|
||||||
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
reverse_vocab = {
|
reverse_vocab = {
|
||||||
@ -559,18 +627,7 @@ class HfVocab:
|
|||||||
yield from self.added_tokens()
|
yield from self.added_tokens()
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||||
|
|
||||||
|
|
||||||
class NoVocab:
|
|
||||||
tokenizer_model = "no_vocab"
|
|
||||||
name = "no_vocab"
|
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
|
||||||
return "<NoVocab for a model without integrated vocabulary>"
|
|
||||||
|
|
||||||
|
|
||||||
Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"
|
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
@ -588,7 +645,7 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
|
|||||||
.reshape(weights.shape))
|
.reshape(weights.shape))
|
||||||
|
|
||||||
|
|
||||||
class Tensor(metaclass=ABCMeta):
|
class Tensor(ABC):
|
||||||
data_type: DataType
|
data_type: DataType
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
@ -610,7 +667,7 @@ def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
|
|||||||
|
|
||||||
|
|
||||||
class UnquantizedTensor(Tensor):
|
class UnquantizedTensor(Tensor):
|
||||||
def __init__(self, ndarray: NDArray) -> None:
|
def __init__(self, ndarray: NDArray):
|
||||||
assert isinstance(ndarray, np.ndarray)
|
assert isinstance(ndarray, np.ndarray)
|
||||||
self.ndarray = ndarray
|
self.ndarray = ndarray
|
||||||
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
|
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
|
||||||
@ -689,7 +746,7 @@ class ModelPlus:
|
|||||||
model: LazyModel
|
model: LazyModel
|
||||||
paths: list[Path] # Where this was read from.
|
paths: list[Path] # Where this was read from.
|
||||||
format: Literal['ggml', 'torch', 'safetensors', 'none']
|
format: Literal['ggml', 'torch', 'safetensors', 'none']
|
||||||
vocab: Vocab | None # For GGML models (which have vocab built in), the vocab.
|
vocab: BaseVocab | None # For GGML models (which have vocab built in), the vocab.
|
||||||
|
|
||||||
|
|
||||||
def merge_sharded(models: list[LazyModel]) -> LazyModel:
|
def merge_sharded(models: list[LazyModel]) -> LazyModel:
|
||||||
@ -698,7 +755,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
|
|||||||
names = {name: None for model in models for name in model}
|
names = {name: None for model in models for name in model}
|
||||||
|
|
||||||
def convert(name: str) -> LazyTensor:
|
def convert(name: str) -> LazyTensor:
|
||||||
lazy_tensors: list[LazyTensor] = [model[name] for model in models]
|
lazy_tensors = [model[name] for model in models]
|
||||||
if len(lazy_tensors) == 1:
|
if len(lazy_tensors) == 1:
|
||||||
# only one file; don't go through this procedure since there might
|
# only one file; don't go through this procedure since there might
|
||||||
# be quantized tensors
|
# be quantized tensors
|
||||||
@ -719,7 +776,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
|
|||||||
|
|
||||||
def load() -> UnquantizedTensor:
|
def load() -> UnquantizedTensor:
|
||||||
ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
|
ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
|
||||||
concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
|
concatenated = np.concatenate(ndarrays, axis=axis)
|
||||||
return UnquantizedTensor(concatenated)
|
return UnquantizedTensor(concatenated)
|
||||||
description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
|
description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
|
||||||
return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
|
return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
|
||||||
@ -807,10 +864,10 @@ class LazyUnpickler(pickle.Unpickler):
|
|||||||
|
|
||||||
def load(offset: int, elm_count: int) -> NDArray:
|
def load(offset: int, elm_count: int) -> NDArray:
|
||||||
dtype = data_type.dtype
|
dtype = data_type.dtype
|
||||||
fp = self.zip_file.open(info)
|
with self.zip_file.open(info) as fp:
|
||||||
fp.seek(offset * dtype.itemsize)
|
fp.seek(offset * dtype.itemsize)
|
||||||
size = elm_count * dtype.itemsize
|
size = elm_count * dtype.itemsize
|
||||||
data = fp.read(size)
|
data = fp.read(size)
|
||||||
assert len(data) == size
|
assert len(data) == size
|
||||||
return np.frombuffer(data, dtype)
|
return np.frombuffer(data, dtype)
|
||||||
description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
|
description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
|
||||||
@ -831,7 +888,7 @@ class LazyUnpickler(pickle.Unpickler):
|
|||||||
def rebuild_from_type_v2(func, new_type, args, state):
|
def rebuild_from_type_v2(func, new_type, args, state):
|
||||||
return func(*args)
|
return func(*args)
|
||||||
|
|
||||||
CLASSES: dict[tuple[str, str], Any] = {
|
CLASSES = {
|
||||||
# getattr used here as a workaround for mypy not being smart enough to determine
|
# getattr used here as a workaround for mypy not being smart enough to determine
|
||||||
# the staticmethods have a __func__ attribute.
|
# the staticmethods have a __func__ attribute.
|
||||||
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
|
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
|
||||||
@ -890,7 +947,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
|
|||||||
def must_read(fp: IO[bytes], length: int) -> bytes:
|
def must_read(fp: IO[bytes], length: int) -> bytes:
|
||||||
ret = fp.read(length)
|
ret = fp.read(length)
|
||||||
if len(ret) < length:
|
if len(ret) < length:
|
||||||
raise Exception("unexpectedly reached end of file")
|
raise EOFError("unexpectedly reached end of file")
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
@ -948,13 +1005,14 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
|
|||||||
yield result
|
yield result
|
||||||
|
|
||||||
|
|
||||||
def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
|
def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None:
|
||||||
# Handle special case where the model's vocab size is not set
|
# Handle special case where the model's vocab size is not set
|
||||||
if params.n_vocab == -1:
|
if params.n_vocab == -1:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
|
"The model's vocab size is set to -1 in params.json. Please update it manually."
|
||||||
|
+ (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""),
|
||||||
)
|
)
|
||||||
if isinstance(vocab, NoVocab):
|
if not isinstance(vocab, Vocab):
|
||||||
return # model has no vocab
|
return # model has no vocab
|
||||||
|
|
||||||
# Check for a vocab size mismatch
|
# Check for a vocab size mismatch
|
||||||
@ -979,11 +1037,11 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
|
|||||||
if vocab.vocab_size < params.n_vocab:
|
if vocab.vocab_size < params.n_vocab:
|
||||||
msg += " Add the --pad-vocab option and try again."
|
msg += " Add the --pad-vocab option and try again."
|
||||||
|
|
||||||
raise Exception(msg)
|
raise ValueError(msg)
|
||||||
|
|
||||||
|
|
||||||
class OutputFile:
|
class OutputFile:
|
||||||
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
|
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
|
||||||
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
||||||
|
|
||||||
def add_meta_arch(self, params: Params) -> None:
|
def add_meta_arch(self, params: Params) -> None:
|
||||||
@ -1034,8 +1092,6 @@ class OutputFile:
|
|||||||
self.gguf.add_file_type(params.ftype)
|
self.gguf.add_file_type(params.ftype)
|
||||||
|
|
||||||
def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
|
def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
|
||||||
assert not isinstance(vocab, NoVocab)
|
|
||||||
|
|
||||||
tokens = []
|
tokens = []
|
||||||
scores = []
|
scores = []
|
||||||
toktypes = []
|
toktypes = []
|
||||||
@ -1135,7 +1191,7 @@ class OutputFile:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def write_all(
|
def write_all(
|
||||||
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
|
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
|
||||||
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
||||||
pad_vocab: bool = False,
|
pad_vocab: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
@ -1145,11 +1201,11 @@ class OutputFile:
|
|||||||
|
|
||||||
# meta data
|
# meta data
|
||||||
of.add_meta_arch(params)
|
of.add_meta_arch(params)
|
||||||
if isinstance(vocab, NoVocab):
|
if isinstance(vocab, Vocab):
|
||||||
of.gguf.add_tokenizer_model(vocab.tokenizer_model)
|
|
||||||
else:
|
|
||||||
of.add_meta_vocab(vocab)
|
of.add_meta_vocab(vocab)
|
||||||
of.add_meta_special_vocab(svocab)
|
of.add_meta_special_vocab(svocab)
|
||||||
|
else: # NoVocab
|
||||||
|
of.gguf.add_tokenizer_model(vocab.tokenizer_model)
|
||||||
|
|
||||||
# tensor info
|
# tensor info
|
||||||
for name, lazy_tensor in model.items():
|
for name, lazy_tensor in model.items():
|
||||||
@ -1176,7 +1232,7 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
|
|||||||
|
|
||||||
name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
|
name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
|
||||||
|
|
||||||
raise Exception(f"Unexpected combination of types: {name_to_type}")
|
raise ValueError(f"Unexpected combination of types: {name_to_type}")
|
||||||
|
|
||||||
|
|
||||||
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
|
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
|
||||||
@ -1186,7 +1242,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM
|
|||||||
|
|
||||||
def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
|
def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
|
||||||
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
|
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
|
||||||
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
|
should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
|
||||||
|
|
||||||
tmp = model
|
tmp = model
|
||||||
|
|
||||||
@ -1213,8 +1269,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
|
|||||||
if skip_unknown:
|
if skip_unknown:
|
||||||
print(f"Unexpected tensor name: {name} - skipping")
|
print(f"Unexpected tensor name: {name} - skipping")
|
||||||
continue
|
continue
|
||||||
else:
|
raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
|
||||||
raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
|
|
||||||
|
|
||||||
if tensor_type in should_skip:
|
if tensor_type in should_skip:
|
||||||
print(f"skipping tensor {name_new}")
|
print(f"skipping tensor {name_new}")
|
||||||
@ -1231,7 +1286,7 @@ def nth_multifile_path(path: Path, n: int) -> Path | None:
|
|||||||
the nth path in the model.
|
the nth path in the model.
|
||||||
'''
|
'''
|
||||||
# Support the following patterns:
|
# Support the following patterns:
|
||||||
patterns: list[tuple[str, str]] = [
|
patterns = [
|
||||||
# - x.00.pth, x.01.pth, etc.
|
# - x.00.pth, x.01.pth, etc.
|
||||||
(r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
|
(r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
|
||||||
# - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
|
# - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
|
||||||
@ -1277,9 +1332,9 @@ def load_some_model(path: Path) -> ModelPlus:
|
|||||||
globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
|
globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
|
||||||
files = [file for glob in globs for file in path.glob(glob)]
|
files = [file for glob in globs for file in path.glob(glob)]
|
||||||
if not files:
|
if not files:
|
||||||
raise Exception(f"Can't find model in directory {path}")
|
raise FileNotFoundError(f"Can't find model in directory {path}")
|
||||||
if len(files) > 1:
|
if len(files) > 1:
|
||||||
raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
|
raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}")
|
||||||
path = files[0]
|
path = files[0]
|
||||||
|
|
||||||
paths = find_multifile_paths(path)
|
paths = find_multifile_paths(path)
|
||||||
@ -1293,36 +1348,14 @@ def load_some_model(path: Path) -> ModelPlus:
|
|||||||
|
|
||||||
|
|
||||||
class VocabFactory:
|
class VocabFactory:
|
||||||
_FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
|
_VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]
|
||||||
|
|
||||||
def __init__(self, path: Path):
|
def __init__(self, path: Path):
|
||||||
self.path = path
|
self.path = path
|
||||||
self.file_paths = self._detect_files()
|
|
||||||
print(f"Found vocab files: {self.file_paths}")
|
|
||||||
|
|
||||||
def _detect_files(self) -> dict[str, Path | None]:
|
def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
|
||||||
def locate(file: str) -> Path | None:
|
|
||||||
if (path := self.path / file).exists():
|
|
||||||
return path
|
|
||||||
if (path := self.path.parent / file).exists():
|
|
||||||
return path
|
|
||||||
return None
|
|
||||||
|
|
||||||
return {vt: locate(f) for vt, f in self._FILES.items()}
|
|
||||||
|
|
||||||
def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
|
|
||||||
for vtype in vocab_types:
|
|
||||||
try:
|
|
||||||
path = self.file_paths[vtype]
|
|
||||||
except KeyError:
|
|
||||||
raise ValueError(f"Unsupported vocabulary type {vtype}") from None
|
|
||||||
if path is not None:
|
|
||||||
return vtype, path
|
|
||||||
raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
|
|
||||||
|
|
||||||
def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
|
|
||||||
load_merges = vocab.name == "bpe"
|
load_merges = vocab.name == "bpe"
|
||||||
n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
|
n_vocab = vocab.vocab_size if isinstance(vocab, Vocab) else None
|
||||||
return gguf.SpecialVocab(
|
return gguf.SpecialVocab(
|
||||||
model_parent_path,
|
model_parent_path,
|
||||||
load_merges=load_merges,
|
load_merges=load_merges,
|
||||||
@ -1331,27 +1364,29 @@ class VocabFactory:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
|
def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
|
||||||
vocab_type, path = self._select_file(vocab_types)
|
vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES}
|
||||||
print(f"Loading vocab file {path!r}, type {vocab_type!r}")
|
selected_vocabs: dict[str, type[Vocab]] = {}
|
||||||
|
for vtype in vocab_types:
|
||||||
|
try:
|
||||||
|
selected_vocabs[vtype] = vocab_classes[vtype]
|
||||||
|
except KeyError:
|
||||||
|
raise ValueError(f"Unsupported vocabulary type {vtype}") from None
|
||||||
|
|
||||||
added_tokens_path = path.parent / "added_tokens.json"
|
for vtype, cls in selected_vocabs.items():
|
||||||
if vocab_type == "bpe":
|
try:
|
||||||
return BpeVocab(
|
vocab = cls(self.path)
|
||||||
path, added_tokens_path if added_tokens_path.exists() else None
|
break
|
||||||
)
|
except FileNotFoundError:
|
||||||
if vocab_type == "spm":
|
pass # ignore unavailable tokenizers
|
||||||
return SentencePieceVocab(
|
else:
|
||||||
path, added_tokens_path if added_tokens_path.exists() else None
|
raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
|
||||||
)
|
|
||||||
if vocab_type == "hfft":
|
|
||||||
return HfVocab(
|
|
||||||
path.parent, added_tokens_path if added_tokens_path.exists() else None
|
|
||||||
)
|
|
||||||
raise ValueError(vocab_type)
|
|
||||||
|
|
||||||
def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
|
print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
|
||||||
vocab: Vocab
|
return vocab
|
||||||
if len(vocab_types) == 1 and "no_vocab" in vocab_types:
|
|
||||||
|
def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
|
||||||
|
vocab: BaseVocab
|
||||||
|
if vocab_types is None:
|
||||||
vocab = NoVocab()
|
vocab = NoVocab()
|
||||||
else:
|
else:
|
||||||
vocab = self._create_vocab_by_path(vocab_types)
|
vocab = self._create_vocab_by_path(vocab_types)
|
||||||
@ -1408,10 +1443,8 @@ def main(args_in: list[str] | None = None) -> None:
|
|||||||
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
|
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
|
||||||
|
|
||||||
args = parser.parse_args(args_in)
|
args = parser.parse_args(args_in)
|
||||||
if args.no_vocab:
|
if args.no_vocab and args.vocab_only:
|
||||||
if args.vocab_only:
|
raise ValueError("--vocab-only does not make sense with --no-vocab")
|
||||||
raise ValueError("no need to specify --vocab-only if using --no-vocab")
|
|
||||||
args.vocab_type = "no_vocab"
|
|
||||||
|
|
||||||
if args.dump_single:
|
if args.dump_single:
|
||||||
model_plus = lazy_load_file(args.model)
|
model_plus = lazy_load_file(args.model)
|
||||||
@ -1433,10 +1466,12 @@ def main(args_in: list[str] | None = None) -> None:
|
|||||||
params = Params.load(model_plus)
|
params = Params.load(model_plus)
|
||||||
if params.n_ctx == -1:
|
if params.n_ctx == -1:
|
||||||
if args.ctx is None:
|
if args.ctx is None:
|
||||||
raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
|
msg = """\
|
||||||
"Please specify one with --ctx:\n"
|
The model doesn't have a context size, and you didn't specify one with --ctx
|
||||||
" - LLaMA v1: --ctx 2048\n"
|
Please specify one with --ctx:
|
||||||
" - LLaMA v2: --ctx 4096\n")
|
- LLaMA v1: --ctx 2048
|
||||||
|
- LLaMA v2: --ctx 4096"""
|
||||||
|
parser.error(textwrap.dedent(msg))
|
||||||
params.n_ctx = args.ctx
|
params.n_ctx = args.ctx
|
||||||
|
|
||||||
if args.outtype:
|
if args.outtype:
|
||||||
@ -1451,9 +1486,11 @@ def main(args_in: list[str] | None = None) -> None:
|
|||||||
model_parent_path = model_plus.paths[0].parent
|
model_parent_path = model_plus.paths[0].parent
|
||||||
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
|
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
|
||||||
vocab_factory = VocabFactory(vocab_path)
|
vocab_factory = VocabFactory(vocab_path)
|
||||||
vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path)
|
vocab_types = None if args.no_vocab else args.vocab_type.split(",")
|
||||||
|
vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
|
||||||
|
|
||||||
if args.vocab_only:
|
if args.vocab_only:
|
||||||
|
assert isinstance(vocab, Vocab)
|
||||||
if not args.outfile:
|
if not args.outfile:
|
||||||
raise ValueError("need --outfile if using --vocab-only")
|
raise ValueError("need --outfile if using --vocab-only")
|
||||||
outfile = args.outfile
|
outfile = args.outfile
|
||||||
|
6
llama.h
6
llama.h
@ -60,9 +60,9 @@ extern "C" {
|
|||||||
|
|
||||||
enum llama_vocab_type {
|
enum llama_vocab_type {
|
||||||
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
||||||
LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece
|
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
||||||
LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding
|
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
||||||
LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
|
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
||||||
};
|
};
|
||||||
|
|
||||||
// note: these values should be synchronized with ggml_rope
|
// note: these values should be synchronized with ggml_rope
|
||||||
|
Loading…
Reference in New Issue
Block a user