From 748fc8baa3ef6902b7c5cfec2ceedd632775a64d Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Wed, 27 Mar 2024 16:13:09 -0400
Subject: [PATCH] convert-hf-to-gguf : fix BERT abuse of LlamaHfVocab

---
 convert-hf-to-gguf.py | 50 +++++++++++++++++---------------------------------
 1 file changed, 17 insertions(+), 33 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 918a90e58..8fe211c33 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -227,15 +227,14 @@ class Model(ABC):
             return ("pytorch_model.bin",)
         return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
 
-    def _set_vocab_gpt2(self):
-        dir_model = self.dir_model
-        hparams = self.hparams
+    # used for GPT-2 BPE and WordPiece vocabs
+    def get_basic_vocab(self) -> tuple[list[str], list[int]]:
         tokens: list[str] = []
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model)
-        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size
 
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
@@ -255,11 +254,15 @@ class Model(ABC):
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)
 
+        return tokens, toktypes
+
+    def _set_vocab_gpt2(self) -> None:
+        tokens, toktypes = self.get_basic_vocab()
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def _set_vocab_qwen(self):
@@ -1694,34 +1697,25 @@ class BertModel(Model):
         self.gguf_writer.add_pooling_type(pooling_type)
 
     def set_vocab(self):
-        # use huggingface vocab to get all tokens
-        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
-        tokens, scores, toktypes = zip(*vocab.all_tokens())
-        assert len(tokens) == vocab.vocab_size
-        self.vocab_size = vocab.vocab_size
+        tokens, toktypes = self.get_basic_vocab()
+        self.vocab_size = len(tokens)
 
         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
-        n_token_types = len(set(toktypes))
-        self.gguf_writer.add_token_type_count(n_token_types)
+        self.gguf_writer.add_token_type_count(2)  # "Sequence A" or "Sequence B"
 
         # convert to phantom space vocab
-        def phantom(tok, typ):
-            if tok.startswith(b"[") and tok.endswith(b"]"):
+        def phantom(tok):
+            if tok.startswith("[") and tok.endswith("]"):
                 return tok
-            if tok.startswith(b"##"):
+            if tok.startswith("##"):
                 return tok[2:]
-            return b"\xe2\x96\x81" + tok
-        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
-
-        # set up bos and eos tokens (cls and sep)
-        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
-        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
+            return "\u2581" + tok
+        tokens = list(map(phantom, tokens))
 
         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")
         self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
 
         # handle special tokens
@@ -1793,16 +1787,6 @@ class NomicBertModel(BertModel):
         super().set_gguf_parameters()
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
 
-    def get_tensors(self):
-        assert self.vocab_size is not None
-        for name, data in super().get_tensors():
-            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
-            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
-                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
-                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
-                data = data[:self.vocab_size, :]
-            yield name, data
-
 
 @Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
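
Note (illustration only, not part of the patch): a minimal standalone sketch of the
phantom-space mapping that the new BertModel.set_vocab applies after get_basic_vocab().
The example tokens below ("[CLS]", "##ing", "hello") are made up for demonstration and
are not taken from any particular vocabulary; the logic mirrors phantom() in the hunk
at @@ -1694,34 +1697,25 @@ above.

    def phantom(tok: str) -> str:
        # special tokens such as [CLS], [SEP], [MASK] pass through unchanged
        if tok.startswith("[") and tok.endswith("]"):
            return tok
        # continuation pieces: strip the WordPiece "##" prefix
        if tok.startswith("##"):
            return tok[2:]
        # word-initial pieces: prepend the phantom space (U+2581)
        return "\u2581" + tok

    assert phantom("[CLS]") == "[CLS]"
    assert phantom("##ing") == "ing"
    assert phantom("hello") == "\u2581hello"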