mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 05:17:21 +01:00
Add byte token type when tokenizer.model does not exist (#4641)
* Add byte token type to hf format
* remove unused variable
This commit is contained in:
parent
dc68f0054c
commit
f56d6077d0
10
convert.py
10
convert.py
@ -357,6 +357,7 @@ class VocabLoader:
|
|||||||
for tok in self.tokenizer.all_special_tokens
|
for tok in self.tokenizer.all_special_tokens
|
||||||
}
|
}
|
||||||
self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
|
self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
|
||||||
|
self.reverse_vocab = {id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()}
|
||||||
self.vocab_size_base: int = self.tokenizer.vocab_size
|
self.vocab_size_base: int = self.tokenizer.vocab_size
|
||||||
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
|
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
|
||||||
self.fname_tokenizer: Path = fname_tokenizer
|
self.fname_tokenizer: Path = fname_tokenizer
|
||||||
@ -370,15 +371,13 @@ class VocabLoader:
|
|||||||
self.spm = None
|
self.spm = None
|
||||||
|
|
||||||
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
tokenizer = self.tokenizer
|
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()}
|
|
||||||
added_tokens_ids = set(self.added_tokens_dict.values())
|
added_tokens_ids = set(self.added_tokens_dict.values())
|
||||||
|
|
||||||
for i in range(self.vocab_size_base):
|
for i in range(self.vocab_size_base):
|
||||||
if i in added_tokens_ids:
|
if i in added_tokens_ids:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
text = reverse_vocab[i].encode("utf-8")
|
text = self.reverse_vocab[i].encode("utf-8")
|
||||||
yield text, self.get_token_score(i), self.get_token_type(i)
|
yield text, self.get_token_score(i), self.get_token_type(i)
|
||||||
|
|
||||||
def get_token_type(self, token_id: int) -> gguf.TokenType:
|
def get_token_type(self, token_id: int) -> gguf.TokenType:
|
||||||
@ -394,10 +393,13 @@ class VocabLoader:
|
|||||||
if self.spm.is_byte(token_id):
|
if self.spm.is_byte(token_id):
|
||||||
toktype = gguf.TokenType.BYTE
|
toktype = gguf.TokenType.BYTE
|
||||||
else:
|
else:
|
||||||
|
token = self.reverse_vocab[token_id]
|
||||||
if token_id == self.unk_token_id:
|
if token_id == self.unk_token_id:
|
||||||
toktype = gguf.TokenType.UNKNOWN
|
toktype = gguf.TokenType.UNKNOWN
|
||||||
if token_id in self.special_ids:
|
elif token_id in self.special_ids:
|
||||||
toktype = gguf.TokenType.CONTROL
|
toktype = gguf.TokenType.CONTROL
|
||||||
|
elif len(token) == 6 and token.startswith("<0x") and token.endswith(">"):
|
||||||
|
toktype = gguf.TokenType.BYTE
|
||||||
|
|
||||||
return toktype
|
return toktype
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user