Support Llama 3 conversion

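The Llama 3 tokenizer is BPE, so LlamaHfVocab rejects it with a TypeError and the converter falls back to the GPT-2 style BPE vocab (`_set_vocab_gpt2`).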
This commit is contained in:
Pedro Cuenca 2024-04-18 18:38:05 +02:00
parent 0d56246f4b
commit d79ab101c3
2 changed files with 22 additions and 9 deletions

View File

@ -1301,8 +1301,14 @@ class LlamaModel(Model):
try: try:
self. _set_vocab_sentencepiece() self. _set_vocab_sentencepiece()
except FileNotFoundError: except FileNotFoundError:
try:
self._set_vocab_llama_hf() self._set_vocab_llama_hf()
except TypeError:
# Llama 3
self._set_vocab_gpt2()
# Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
if self.hparams.get("vocab_size", 32000) == 32016:
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
special_token_types = ['prefix', 'suffix', 'middle', 'eot']) special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
special_vocab._set_special_token("prefix", 32007) special_vocab._set_special_token("prefix", 32007)

View File

@ -525,7 +525,14 @@ class LlamaHfVocab(Vocab):
# pre-check so we know if we need transformers # pre-check so we know if we need transformers
tokenizer_model: dict[str, Any] = tokenizer_json['model'] tokenizer_model: dict[str, Any] = tokenizer_json['model']
if ( is_llama3 = (
tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
and not tokenizer_model.get('byte_fallback', True)
)
if is_llama3:
raise TypeError('Llama 3 must be converted with BpeVocab')
if not is_llama3 and (
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False) tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'Sequence' or tokenizer_json['decoder']['type'] != 'Sequence'
): ):