Support Llama 3 conversion

The tokenizer is BPE.
2025-01-13 22:08:55 +01:00 · 2024-04-18 18:38:05 +02:00 · 2024-04-18 18:38:05 +02:00 · d79ab101c3
commit d79ab101c3
parent 0d56246f4b
2 changed files with 22 additions and 9 deletions
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -1301,15 +1301,21 @@ class LlamaModel(Model):
        try:
            self. _set_vocab_sentencepiece()
        except FileNotFoundError:
-            self._set_vocab_llama_hf()
+            try:
+                self._set_vocab_llama_hf()
+            except TypeError:
+                # Llama 3
+                self._set_vocab_gpt2()

-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
-                                          special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
-        special_vocab._set_special_token("prefix", 32007)
-        special_vocab._set_special_token("suffix", 32008)
-        special_vocab._set_special_token("middle", 32009)
-        special_vocab._set_special_token("eot",    32010)
-        special_vocab.add_to_gguf(self.gguf_writer)
+        # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
+        if self.hparams.get("vocab_size", 32000) == 32016:
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
+                                            special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
+            special_vocab._set_special_token("prefix", 32007)
+            special_vocab._set_special_token("suffix", 32008)
+            special_vocab._set_special_token("middle", 32009)
+            special_vocab._set_special_token("eot",    32010)
+            special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
--- a/convert.py
+++ b/convert.py
@ -525,7 +525,14 @@ class LlamaHfVocab(Vocab):

        # pre-check so we know if we need transformers
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        if (
+        is_llama3 = (
+            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+            and not tokenizer_model.get('byte_fallback', True)
+        )
+        if is_llama3:
+            raise TypeError('Llama 3 must be converted with BpeVocab')
+
+        if not is_llama3 and (
            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
            or tokenizer_json['decoder']['type'] != 'Sequence'
        ):