From 6e351e04252e5956432078f673d69b5f19de318d Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin <git@compilade.net>
Date: Sun, 7 Jul 2024 16:59:00 -0400
Subject: [PATCH] convert_hf : identify which user-defined tokens are control
 tokens

Only used in _set_vocab_gpt2() for now.
---
 convert_hf_to_gguf.py | 17 +++++++++++++++--
 src/llama.cpp         |  8 --------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 6cea73f08..30f87a9fe 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -373,6 +373,18 @@ class Model:
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
 
+    def does_token_look_special(self, token: str) -> bool:
+        # Some models mark some added tokens which ought to be control tokens as not special.
+        # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
+        is_known_special = token in (
+            "<pad>",  # deepseek-coder
+            "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
+        )
+        # TODO: should these be marked as UNUSED instead?
+        is_known_special = is_known_special or (token.startswith("<unused") and token.endswith(">"))  # gemma{,-2}
+
+        return is_known_special or (token.startswith(("<|", "<｜")) and token.endswith(("|>", "｜>")))
+
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
@@ -393,8 +405,9 @@ class Model:
                 tokens.append(f"[PAD{i}]")
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
+                token: str = reverse_vocab[i]
+                tokens.append(token)
+                if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                     toktypes.append(gguf.TokenType.CONTROL)
                 else:
                     toktypes.append(gguf.TokenType.USER_DEFINED)
diff --git a/src/llama.cpp b/src/llama.cpp
index 1794ec2bd..11147eb11 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5512,14 +5512,6 @@ static void llm_load_vocab(
                     default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
                 }
             }
-
-            if ((token_data.attr & LLAMA_TOKEN_ATTR_USER_DEFINED) && !token_data.text.empty() &&
-                    token_data.text.front() == '<' && token_data.text.back() == '>') {
-                // Some models mark some added tokens which ought to be control tokens as not special.
-                // (e.g. command-r, command-r-plus, deepseek-coder)
-                // TODO: should this be fixed in the convert script instead?
-                token_data.attr = LLAMA_TOKEN_ATTR_CONTROL;
-            }
         }
 
         GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
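
Note (illustration, not part of the patch): the convert-time heuristic added above is deliberately narrower than the llama.cpp fallback it removes, which promoted *any* user-defined token wrapped in "<...>" to a control token. A minimal standalone sketch of the same checks, assuming plain str input; the sample tokens below are illustrative, not taken from any particular model:

    # Same logic as the does_token_look_special() added above, outside the Model class.
    def does_token_look_special(token: str) -> bool:
        # Known control-like tokens that some models mark as not special.
        is_known_special = token in (
            "<pad>",                         # deepseek-coder
            "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
        )
        # gemma's <unused...> placeholder tokens
        is_known_special = is_known_special or (token.startswith("<unused") and token.endswith(">"))
        # <|...|> delimiters, plus the fullwidth <｜...｜> variant used by deepseek-coder
        return is_known_special or (token.startswith(("<|", "<｜")) and token.endswith(("|>", "｜>")))

    for t in ("<|im_start|>", "<unused42>", "<mask>", "<some_tag>", "hello"):
        print(f"{t!r}: {'CONTROL' if does_token_look_special(t) else 'USER_DEFINED'}")
    # '<|im_start|>': CONTROL
    # '<unused42>': CONTROL
    # '<mask>': CONTROL
    # '<some_tag>': USER_DEFINED  (the removed C++ check would have matched this)
    # 'hello': USER_DEFINED

Tokens like "<some_tag>" now stay USER_DEFINED unless the tokenizer already marks them special, which is the behavioral difference this patch trades the C++ fallback for.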