mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 05:48:47 +01:00
convert : ignore tokens if their IDs are within [0, vocab_size) (#3831)
This commit is contained in:
parent
bd6d9e2059
commit
8a2f2fea29
19
convert.py
19
convert.py
@ -366,15 +366,18 @@ class SentencePieceVocab:
|
|||||||
added_tokens = {}
|
added_tokens = {}
|
||||||
|
|
||||||
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
|
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
|
||||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
|
||||||
actual_ids = sorted(added_tokens.values())
|
|
||||||
if expected_ids != actual_ids:
|
|
||||||
raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")
|
|
||||||
|
|
||||||
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
|
||||||
self.added_tokens_list = [text for (text, idx) in items]
|
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
|
||||||
self.vocab_size_base: int = vocab_size
|
actual_new_ids = sorted(new_tokens.keys())
|
||||||
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
|
|
||||||
|
if expected_new_ids != actual_new_ids:
|
||||||
|
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
|
||||||
|
|
||||||
|
# Token pieces that were added to the base vocabulary.
|
||||||
|
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
||||||
|
self.vocab_size_base = vocab_size
|
||||||
|
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||||
self.fname_tokenizer = fname_tokenizer
|
self.fname_tokenizer = fname_tokenizer
|
||||||
self.fname_added_tokens = fname_added_tokens
|
self.fname_added_tokens = fname_added_tokens
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user