Tokenizer SPM fixes for phi-3 and llama-spm (#7375)

* Update brute force test: special tokens
* Fix added tokens
  - Try to read 'added_tokens.json'.
  - Try to read 'tokenizer_config.json'.
  - Try to read 'tokenizer.json'.
* Fix special tokens rtrim

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* server : fix test regexes
This commit is contained in:
jaime-m-p 2024-05-20 20:15:57 +02:00 committed by GitHub
parent fabf30b4c4
commit 917dc8cfa6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 98 additions and 14 deletions

View File

@ -1740,6 +1740,38 @@ class Phi3MiniModel(Model):
scores[token_id] = -1000.0 scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
for token_id, foken_data in added_tokens_decoder.items():
token_id = int(token_id)
token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
assert(tokens[token_id] == token)
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
if foken_data.get("special"):
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
tokenizer_file = self.dir_model / 'tokenizer.json'
if tokenizer_file.is_file():
with open(tokenizer_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
added_tokens = tokenizer_json.get("added_tokens", [])
for foken_data in added_tokens:
token_id = int(foken_data["id"])
token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
assert(tokens[token_id] == token)
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
if foken_data.get("special"):
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)

View File

@ -37,8 +37,8 @@ Feature: llama.cpp server
Examples: Prompts Examples: Prompts
| prompt | n_predict | re_content | n_prompt | n_predicted | truncated | | prompt | n_predict | re_content | n_prompt | n_predicted | truncated |
| I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not | | I believe the meaning of life is | 8 | (read\|going\|pretty)+ | 18 | 8 | not |
| Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 46 | 64 | not | | Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 45 | 64 | not |
Scenario: Completion prompt truncated Scenario: Completion prompt truncated
Given a prompt: Given a prompt:
@ -67,8 +67,8 @@ Feature: llama.cpp server
Examples: Prompts Examples: Prompts
| model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated | | model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated |
| llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not | | llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 76 | 8 | disabled | not |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled | | | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|fireplace)+ | -1 | 64 | enabled | |
Scenario Outline: OAI Compatibility w/ response format Scenario Outline: OAI Compatibility w/ response format
@ -84,7 +84,7 @@ Feature: llama.cpp server
| response_format | n_predicted | re_content | | response_format | n_predicted | re_content |
| {"type": "json_object", "schema": {"const": "42"}} | 5 | "42" | | {"type": "json_object", "schema": {"const": "42"}} | 5 | "42" |
| {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] | | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] |
| {"type": "json_object"} | 10 | \{ " Jacky. | | {"type": "json_object"} | 10 | \{ " Saragine. |
Scenario: Tokenize / Detokenize Scenario: Tokenize / Detokenize

View File

@ -26,7 +26,7 @@ Feature: llama.cpp server slot management
# Since we have cache, this should only process the last tokens # Since we have cache, this should only process the last tokens
Given a user prompt "What is the capital of Germany?" Given a user prompt "What is the capital of Germany?"
And a completion request with no api error And a completion request with no api error
Then 24 tokens are predicted matching (Thank|special) Then 24 tokens are predicted matching (Thank|special|Lily)
And 7 prompt tokens are processed And 7 prompt tokens are processed
# Loading the original cache into slot 0, # Loading the original cache into slot 0,
# we should only be processing 1 prompt token and get the same output # we should only be processing 1 prompt token and get the same output
@ -41,7 +41,7 @@ Feature: llama.cpp server slot management
Given a user prompt "What is the capital of Germany?" Given a user prompt "What is the capital of Germany?"
And using slot id 1 And using slot id 1
And a completion request with no api error And a completion request with no api error
Then 24 tokens are predicted matching (Thank|special) Then 24 tokens are predicted matching (Thank|special|Lily)
And 1 prompt tokens are processed And 1 prompt tokens are processed
Scenario: Erase Slot Scenario: Erase Slot

View File

@ -4553,7 +4553,8 @@ static void llm_load_vocab(
(t.first == "<|eot_id|>" || (t.first == "<|eot_id|>" ||
t.first == "<|im_end|>" || t.first == "<|im_end|>" ||
t.first == "<|end|>" || t.first == "<|end|>" ||
t.first == "<end_of_turn>" t.first == "<end_of_turn>" ||
t.first == "<|endoftext|>"
) )
) { ) {
vocab.special_eot_id = t.second; vocab.special_eot_id = t.second;
@ -12502,6 +12503,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
output.push_back(vocab.special_bos_id); output.push_back(vocab.special_bos_id);
} }
static const bool rtrim = true; //TODO: as param
bool is_prev_special = false;
bool special_token_rtrim = false;
for (const auto & fragment : fragment_buffer) { for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
// without adding this leading whitespace, we do not get the same results as the original tokenizer // without adding this leading whitespace, we do not get the same results as the original tokenizer
@ -12511,9 +12516,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
// and passing 'add space prefix' as bool argument // and passing 'add space prefix' as bool argument
// //
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
if (&fragment == &fragment_buffer.front()) {
if (special_token_rtrim) {
size_t num_whitespaces = 0;
while (isspace(raw_text[num_whitespaces])) {
num_whitespaces++;
}
if (num_whitespaces == raw_text.size()) {
continue; // skip if all whitespaces
}
raw_text = raw_text.substr(num_whitespaces);
}
if (vocab.add_space_prefix) { if (vocab.add_space_prefix) {
raw_text = " " + raw_text; // prefix with space if the first token is not special if (!output.size() || is_prev_special) { // prefix with space if first token
raw_text = " " + raw_text;
} }
} }
@ -12525,6 +12542,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
tokenizer.tokenize(raw_text, output); tokenizer.tokenize(raw_text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token); output.push_back(fragment.token);
is_prev_special = true;
// phi-3 special tokens without rtrim, works fine for llama-spm too
special_token_rtrim = rtrim
&& fragment.token != vocab.special_bos_id
&& fragment.token != vocab.special_unk_id
&& fragment.token != vocab.special_eos_id;
} }
} }

View File

@ -153,11 +153,23 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms 'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
'\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM) '\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
'Cửa Việt', # llama-3, ignore_merges = true 'Cửa Việt', # llama-3, ignore_merges = true
'<s>a', # TODO: Phi-3 fail '<s>a', # Phi-3 fail
'<unk><|endoftext|><s>' # Phi-3 fail
'a\na', # TODO: Bert fail 'a\na', # TODO: Bert fail
] ]
def generator_random_special_tokens(special_tokens:list[str], iterations=100) -> Iterator[str]:
special_tokens = set(special_tokens)
special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
special_tokens = list(sorted(special_tokens))
rand = random.Random()
for m in range(iterations):
rand.seed(m)
words = rand.choices(special_tokens, k=500)
yield "".join(words)
def generator_vocab_words(vocab: list[str]) -> Iterator[str]: def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
"""Brute force check all vocab words""" """Brute force check all vocab words"""
yield from vocab yield from vocab
@ -289,14 +301,31 @@ def main(argv: list[str] = None):
vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True))) vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text()) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases()) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer.all_special_tokens, 10_000))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 10_000)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
# test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_bytes(10_000)) # FAIL # test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_bytes(10_000)) # FAIL
model.free() model.free()
if __name__ == "__main__": if __name__ == "__main__":
main() # main()
path_tokenizers = "./models/tokenizers/"
path_vocab_format = "./models/ggml-vocab-%s.gguf"
# import os
# tokenizers = os.listdir(path_tokenizers)
tokenizers = [
"llama-spm", # SPM
"phi-3", # SPM
]
for tokenizer in tokenizers:
print("\n" + "=" * 50 + "\n" + tokenizer + "\n") # noqa
vocab_file = path_vocab_format % tokenizer
dir_tokenizer = path_tokenizers + "/" + tokenizer
main([vocab_file, dir_tokenizer, "--verbose"])