mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-26 12:21:40 +01:00
convert_hf : identify which user-defined tokens are control tokens
Only used in _set_vocab_gpt2() for now.
This commit is contained in:
parent
56df1fcdcb
commit
6e351e0425
@ -373,6 +373,18 @@ class Model:
|
|||||||
except KeyError:
|
except KeyError:
|
||||||
raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
|
raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
|
||||||
|
|
||||||
|
def does_token_look_special(self, token: str) -> bool:
    """Return True when an added token's text looks like a control token.

    Some models mark some added tokens which ought to be control tokens as
    not special (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2}).
    This heuristic inspects only the token text; it does not read any state
    from ``self``.

    A token is considered special when it is:
      * one of a small set of known literals,
      * of the form ``<unused...>``, or
      * wrapped in ``<|`` ... ``|>``, where either bar may be the ASCII
        ``|`` or the fullwidth vertical line U+FF5C.
    """
    is_known_special = token in (
        "<pad>",  # deepseek-coder
        "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
    )
    # TODO: should these be marked as UNUSED instead?
    is_known_special = is_known_special or (
        token.startswith("<unused") and token.endswith(">")
    )  # gemma{,-2}

    # The fullwidth bar is written as an escape on purpose: it is visually
    # almost identical to "|" and is easily flattened to ASCII by tooling,
    # which would silently reduce each tuple to two copies of the same entry.
    opening_markers = ("<|", "<\uff5c")
    closing_markers = ("|>", "\uff5c>")

    return is_known_special or (
        token.startswith(opening_markers) and token.endswith(closing_markers)
    )
|
||||||
|
|
||||||
# used for GPT-2 BPE and WordPiece vocabs
|
# used for GPT-2 BPE and WordPiece vocabs
|
||||||
def get_vocab_base(self) -> tuple[list[str], list[int], str]:
|
def get_vocab_base(self) -> tuple[list[str], list[int], str]:
|
||||||
tokens: list[str] = []
|
tokens: list[str] = []
|
||||||
@ -393,8 +405,9 @@ class Model:
|
|||||||
tokens.append(f"[PAD{i}]")
|
tokens.append(f"[PAD{i}]")
|
||||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||||
elif reverse_vocab[i] in added_vocab:
|
elif reverse_vocab[i] in added_vocab:
|
||||||
tokens.append(reverse_vocab[i])
|
token: str = reverse_vocab[i]
|
||||||
if tokenizer.added_tokens_decoder[i].special:
|
tokens.append(token)
|
||||||
|
if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
|
||||||
toktypes.append(gguf.TokenType.CONTROL)
|
toktypes.append(gguf.TokenType.CONTROL)
|
||||||
else:
|
else:
|
||||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||||
|
@ -5512,14 +5512,6 @@ static void llm_load_vocab(
|
|||||||
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((token_data.attr & LLAMA_TOKEN_ATTR_USER_DEFINED) && !token_data.text.empty() &&
|
|
||||||
token_data.text.front() == '<' && token_data.text.back() == '>') {
|
|
||||||
// Some models mark some added tokens which ought to be control tokens as not special.
|
|
||||||
// (e.g. command-r, command-r-plus, deepseek-coder)
|
|
||||||
// TODO: should this be fixed in the convert script instead?
|
|
||||||
token_data.attr = LLAMA_TOKEN_ATTR_CONTROL;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
|
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user