Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-12 13:27:21 +01:00)
py : fix missing added_tokens_dict for SPM and BPE vocabs (#4971)
* py : fix missing added_tokens_dict for SPM vocab
* py : pad with unknown tokens when data is missing
  ggml-ci
* py : fix BPE vocab conversion
  ggml-ci
* py : fix padded dummy tokens (I hope)
This commit is contained in:
parent 2b3a665d39
commit 4f4bf35f46
20
convert.py
20
convert.py
@ -387,6 +387,7 @@ class BpeVocab: # GPT
|
|||||||
self.bpe_tokenizer = json.loads(
|
self.bpe_tokenizer = json.loads(
|
||||||
open(str(fname_tokenizer), encoding="utf-8").read()
|
open(str(fname_tokenizer), encoding="utf-8").read()
|
||||||
)
|
)
|
||||||
|
self.vocab = self.bpe_tokenizer["model"]["vocab"]
|
||||||
added_tokens: dict[str, int]
|
added_tokens: dict[str, int]
|
||||||
if fname_added_tokens is not None:
|
if fname_added_tokens is not None:
|
||||||
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
||||||
@ -405,7 +406,7 @@ class BpeVocab: # GPT
|
|||||||
if item["content"] not in self.bpe_tokenizer
|
if item["content"] not in self.bpe_tokenizer
|
||||||
)
|
)
|
||||||
|
|
||||||
vocab_size: int = len(self.bpe_tokenizer)
|
vocab_size: int = len(self.vocab)
|
||||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
||||||
actual_ids = sorted(added_tokens.values())
|
actual_ids = sorted(added_tokens.values())
|
||||||
if expected_ids != actual_ids:
|
if expected_ids != actual_ids:
|
||||||
@ -415,6 +416,7 @@ class BpeVocab: # GPT
|
|||||||
)
|
)
|
||||||
|
|
||||||
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
||||||
|
self.added_tokens_dict = added_tokens
|
||||||
self.added_tokens_list = [text for (text, idx) in items]
|
self.added_tokens_list = [text for (text, idx) in items]
|
||||||
self.vocab_size_base: int = vocab_size
|
self.vocab_size_base: int = vocab_size
|
||||||
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
|
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
|
||||||
@ -422,10 +424,9 @@ class BpeVocab: # GPT
|
|||||||
self.fname_added_tokens = fname_added_tokens
|
self.fname_added_tokens = fname_added_tokens
|
||||||
|
|
||||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
tokenizer = self.bpe_tokenizer
|
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
|
|
||||||
|
|
||||||
for i, _ in enumerate(tokenizer):
|
for i, _ in enumerate(self.vocab):
|
||||||
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
|
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
|
||||||
|
|
||||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
@ -466,6 +467,7 @@ class SentencePieceVocab: # LlaMa
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Token pieces that were added to the base vocabulary.
|
# Token pieces that were added to the base vocabulary.
|
||||||
|
self.added_tokens_dict = added_tokens
|
||||||
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
||||||
self.vocab_size_base = vocab_size
|
self.vocab_size_base = vocab_size
|
||||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||||
@ -1006,6 +1008,7 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
|
|||||||
)
|
)
|
||||||
for i in range(1, pad_count + 1):
|
for i in range(1, pad_count + 1):
|
||||||
vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
|
vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
|
||||||
|
vocab.added_tokens_list.append(f"<dummy{i:05}>")
|
||||||
vocab.vocab_size = params.n_vocab
|
vocab.vocab_size = params.n_vocab
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -1097,6 +1100,8 @@ class OutputFile:
|
|||||||
scores.append(score)
|
scores.append(score)
|
||||||
toktypes.append(toktype)
|
toktypes.append(toktype)
|
||||||
|
|
||||||
|
assert(len(tokens) == vocab.vocab_size)
|
||||||
|
|
||||||
return tokens, scores, toktypes
|
return tokens, scores, toktypes
|
||||||
|
|
||||||
def add_meta_vocab(self, vocab: Vocab) -> None:
|
def add_meta_vocab(self, vocab: Vocab) -> None:
|
||||||
@ -1373,15 +1378,14 @@ class VocabFactory:
|
|||||||
self.files[file] = file_path
|
self.files[file] = file_path
|
||||||
elif parent_file_path.exists():
|
elif parent_file_path.exists():
|
||||||
self.files[file] = parent_file_path
|
self.files[file] = parent_file_path
|
||||||
|
print(f"Found vocab files: {self.files}")
|
||||||
|
|
||||||
def _select_file(self, vocabtype: Optional[str]) -> Path:
|
def _select_file(self, vocabtype: Optional[str]) -> Path:
|
||||||
if vocabtype in ["spm", "bpe"]:
|
if vocabtype in ["spm", "bpe"]:
|
||||||
# For SentencePiece and BPE, return specific files as before
|
for file_key in self.files.keys():
|
||||||
file_key = "tokenizer.model" if vocabtype == "spm" else "vocab.json"
|
|
||||||
if self.files[file_key]:
|
if self.files[file_key]:
|
||||||
return self.files[file_key]
|
return self.files[file_key]
|
||||||
else:
|
raise FileNotFoundError(f"{vocabtype} vocab not found.")
|
||||||
raise FileNotFoundError(f"{vocabtype} {file_key} not found.")
|
|
||||||
elif vocabtype == "hfft":
|
elif vocabtype == "hfft":
|
||||||
# For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
|
# For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
|
||||||
return self.path
|
return self.path
|
||||||
|
Loading…
x
Reference in New Issue
Block a user