From d79ab101c352e3ac277075d8816df67ddedb10ac Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Thu, 18 Apr 2024 18:38:05 +0200
Subject: [PATCH] Support Llama 3 conversion

The tokenizer is BPE.
---
 convert-hf-to-gguf.py | 22 ++++++++++++++--------
 convert.py            |  9 ++++++++-
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index c14186abb..875a8c550 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1301,15 +1301,21 @@ class LlamaModel(Model):
         try:
             self._set_vocab_sentencepiece()
         except FileNotFoundError:
-            self._set_vocab_llama_hf()
+            try:
+                self._set_vocab_llama_hf()
+            except TypeError:
+                # Llama 3
+                self._set_vocab_gpt2()
 
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
-                                          special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
-        special_vocab._set_special_token("prefix", 32007)
-        special_vocab._set_special_token("suffix", 32008)
-        special_vocab._set_special_token("middle", 32009)
-        special_vocab._set_special_token("eot", 32010)
-        special_vocab.add_to_gguf(self.gguf_writer)
+        # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
+        if self.hparams.get("vocab_size", 32000) == 32016:
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
+                                              special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
+            special_vocab._set_special_token("prefix", 32007)
+            special_vocab._set_special_token("suffix", 32008)
+            special_vocab._set_special_token("middle", 32009)
+            special_vocab._set_special_token("eot", 32010)
+            special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
diff --git a/convert.py b/convert.py
index 24df0a4d8..1c700cf6a 100755
--- a/convert.py
+++ b/convert.py
@@ -525,7 +525,14 @@ class LlamaHfVocab(Vocab):
         # pre-check so we know if we need transformers
         tokenizer_model: dict[str, Any] = tokenizer_json['model']
 
-        if (
+        is_llama3 = (
+            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+            and not tokenizer_model.get('byte_fallback', True)
+        )
+        if is_llama3:
+            raise TypeError('Llama 3 must be converted with BpeVocab')
+
+        if not is_llama3 and (
             tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
             or tokenizer_json['decoder']['type'] != 'Sequence'
         ):
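
Note (not part of the patch): the convert.py hunk above detects a Llama 3 checkpoint by inspecting its tokenizer.json: a BPE model that sets "ignore_merges" and has no byte fallback. Below is a minimal, self-contained sketch of that same check for readers who want to inspect a local checkpoint before converting; the function name looks_like_llama3_tokenizer and the models/Meta-Llama-3-8B path are illustrative assumptions, not names from the repository.

    import json
    from pathlib import Path

    def looks_like_llama3_tokenizer(model_dir: str) -> bool:
        # Load the Hugging Face fast-tokenizer description shipped with the checkpoint.
        tokenizer_json = json.loads(
            (Path(model_dir) / "tokenizer.json").read_text(encoding="utf-8")
        )
        tok = tokenizer_json["model"]
        # Mirrors the is_llama3 condition introduced in convert.py:
        # BPE tokenizer, merges ignored, and no byte-fallback behaviour.
        return (
            tok["type"] == "BPE"
            and tok.get("ignore_merges", False)
            and not tok.get("byte_fallback", True)
        )

    if __name__ == "__main__":
        # Hypothetical local checkpoint directory; adjust to a real path.
        print(looks_like_llama3_tokenizer("models/Meta-Llama-3-8B"))

If the check returns True, the patched conversion path falls back to the GPT-2 style BPE vocab handling instead of the SentencePiece/HF-Llama path.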