Tokenizer SPM fixes for phi-3 and llama-spm (#7375)

* Update brute force test: special tokens
* Fix added tokens
  - Try to read 'added_tokens.json'.
  - Try to read 'tokenizer_config.json'.
  - Try to read 'tokenizer.json'.
* Fix special tokens rtrim
* server : fix test regexes

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
parent fabf30b4c4
commit 917dc8cfa6
@@ -1740,6 +1740,38 @@ class Phi3MiniModel(Model):
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
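For orientation, here is a minimal standalone sketch of the lookup the two new blocks perform. It is not part of the commit: the function name `read_added_tokens` and its return shape are invented for illustration, and only the Hugging Face tokenizer file layout is assumed.

```python
# Standalone sketch of the added-token merge above (not llama.cpp code).
# Assumes the HF tokenizer file layout; name and return shape are invented.
import json
from pathlib import Path

def read_added_tokens(dir_model: Path) -> dict[int, tuple[bytes, bool]]:
    """Return {token_id: (content_bytes, is_special)}, mirroring the two
    reads of tokenizer_config.json and tokenizer.json in the diff."""
    added: dict[int, tuple[bytes, bool]] = {}

    cfg = dir_model / "tokenizer_config.json"
    if cfg.is_file():
        with open(cfg, encoding="utf-8") as f:
            decoder = json.load(f).get("added_tokens_decoder", {})
        for token_id, data in decoder.items():  # keys are string ids
            added[int(token_id)] = (data["content"].encode("utf-8"),
                                    bool(data.get("special")))

    tok = dir_model / "tokenizer.json"
    if tok.is_file():
        with open(tok, encoding="utf-8") as f:
            entries = json.load(f).get("added_tokens", [])
        for data in entries:  # entries carry integer ids
            added[int(data["id"])] = (data["content"].encode("utf-8"),
                                      bool(data.get("special")))

    return added
```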
@@ -37,8 +37,8 @@ Feature: llama.cpp server
         Examples: Prompts
             | prompt                                                                    | n_predict | re_content                                  | n_prompt | n_predicted | truncated |
-            | I believe the meaning of life is                                          | 8         | (read\|going)+                              | 18       | 8           | not       |
-            | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids\|Anna\|forest)+ | 46       | 64          | not       |
+            | I believe the meaning of life is                                          | 8         | (read\|going\|pretty)+                      | 18       | 8           | not       |
+            | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids\|Anna\|forest)+ | 45       | 64          | not       |
 
     Scenario: Completion prompt truncated
     Given a prompt:
@@ -67,8 +67,8 @@ Feature: llama.cpp server
         Examples: Prompts
             | model        | system_prompt               | user_prompt                          | max_tokens | re_content                        | n_prompt | n_predicted | enable_streaming | truncated |
-            | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+                     | 77       | 8           | disabled         | not       |
-            | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird\|Annabyear)+ | -1       | 64          | enabled          |           |
+            | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+                     | 76       | 8           | disabled         | not       |
+            | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird\|fireplace)+ | -1       | 64          | enabled          |           |
 
 
     Scenario Outline: OAI Compatibility w/ response format
@@ -84,7 +84,7 @@ Feature: llama.cpp server
             | response_format                                                      | n_predicted | re_content      |
             | {"type": "json_object", "schema": {"const": "42"}}                  | 5           | "42"            |
             | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10          | \[ -300 \]      |
-            | {"type": "json_object"}                                              | 10          | \{ " Jacky.     |
+            | {"type": "json_object"}                                              | 10          | \{ " Saragine.  |
 
 
     Scenario: Tokenize / Detokenize
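For context, the `response_format` column maps onto the request body of the server's OpenAI-compatible chat endpoint. A hedged sketch of such a request follows; the host, port, and prompt are assumptions, not values from the tests.

```python
# Hedged sketch of a request like those the scenarios above exercise.
# Host/port and the prompt are assumptions; the payload keys follow the
# OAI-compatible API that the feature file tests.
import json
import urllib.request

payload = {
    "messages": [{"role": "user", "content": "Answer with the number."}],
    "max_tokens": 10,
    "response_format": {"type": "json_object", "schema": {"const": "42"}},
}
req = urllib.request.Request(
    "http://localhost:8080/v1/chat/completions",  # assumed local server
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(resp.read().decode("utf-8"))  # output constrained by the schema
```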
@@ -26,7 +26,7 @@ Feature: llama.cpp server slot management
         # Since we have cache, this should only process the last tokens
         Given a user prompt "What is the capital of Germany?"
         And a completion request with no api error
-        Then 24 tokens are predicted matching (Thank|special)
+        Then 24 tokens are predicted matching (Thank|special|Lily)
         And 7 prompt tokens are processed
         # Loading the original cache into slot 0,
         # we should only be processing 1 prompt token and get the same output
@@ -41,7 +41,7 @@ Feature: llama.cpp server slot management
         Given a user prompt "What is the capital of Germany?"
         And using slot id 1
         And a completion request with no api error
-        Then 24 tokens are predicted matching (Thank|special)
+        Then 24 tokens are predicted matching (Thank|special|Lily)
         And 1 prompt tokens are processed
 
     Scenario: Erase Slot
llama.cpp (29 changes)
@@ -4553,7 +4553,8 @@ static void llm_load_vocab(
                         (t.first == "<|eot_id|>" ||
                          t.first == "<|im_end|>" ||
                          t.first == "<|end|>" ||
-                         t.first == "<end_of_turn>"
+                         t.first == "<end_of_turn>" ||
+                         t.first == "<|endoftext|>"
                         )
                    ) {
                     vocab.special_eot_id = t.second;
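In effect, `<|endoftext|>` joins the set of token texts recognized as end-of-turn. A small Python restatement of the reconstructed rule, for reference; the helper name and dict shape are illustrative only, not llama.cpp API.

```python
# Python restatement of the EOT matching rule above (illustrative only;
# the helper name and dict shape are not from llama.cpp).
EOT_TEXTS = {"<|eot_id|>", "<|im_end|>", "<|end|>", "<end_of_turn>", "<|endoftext|>"}

def find_special_eot_id(token_to_id: dict) -> int | None:
    """Return the id of the first vocab entry whose text marks end-of-turn."""
    for text, token_id in token_to_id.items():
        if text in EOT_TEXTS:
            return token_id
    return None

assert find_special_eot_id({"<|endoftext|>": 32000}) == 32000  # newly matched
```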
@@ -12502,6 +12503,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 output.push_back(vocab.special_bos_id);
             }
 
+            static const bool rtrim = true;  //TODO: as param
+            bool is_prev_special = false;
+            bool special_token_rtrim = false;
+
             for (const auto & fragment : fragment_buffer) {
                 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                     // without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -12511,9 +12516,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     // and passing 'add space prefix' as bool argument
                     //
                     auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-                    if (&fragment == &fragment_buffer.front()) {
-                        if (vocab.add_space_prefix) {
-                            raw_text = " " + raw_text; // prefix with space if the first token is not special
+
+                    if (special_token_rtrim) {
+                        size_t num_whitespaces = 0;
+                        while (isspace(raw_text[num_whitespaces])) {
+                            num_whitespaces++;
+                        }
+                        if (num_whitespaces == raw_text.size()) {
+                            continue; // skip if all whitespaces
+                        }
+                        raw_text = raw_text.substr(num_whitespaces);
+                    }
+
+                    if (vocab.add_space_prefix) {
+                        if (!output.size() || is_prev_special) { // prefix with space if first token
+                            raw_text = " " + raw_text;
                         }
                     }
 
@@ -12525,6 +12542,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     tokenizer.tokenize(raw_text, output);
                 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                     output.push_back(fragment.token);
+                    is_prev_special = true;
+                    // phi-3 special tokens without rtrim, works fine for llama-spm too
+                    special_token_rtrim = rtrim
+                        && fragment.token != vocab.special_bos_id
+                        && fragment.token != vocab.special_unk_id
+                        && fragment.token != vocab.special_eos_id;
                 }
             }
 
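Taken together, the three hunks implement SentencePiece-style rtrim: after a user-defined special token, leading whitespace of the following text fragment is dropped (and all-whitespace fragments are skipped entirely), matching how the HF Phi-3 tokenizer behaves. A minimal Python sketch of just that rule follows; the fragment tuples are invented stand-ins for llama.cpp's fragment buffer.

```python
# Minimal sketch of the rtrim rule above. Fragment tuples are invented
# stand-ins for llama.cpp's fragment buffer; in the real code, BOS/EOS/UNK
# special tokens are exempt from triggering the trim.
def rtrim_after_special(fragments):
    out = []
    strip_next = False
    for kind, value in fragments:
        if kind == "special":
            out.append((kind, value))
            strip_next = True
            continue
        text = value.lstrip() if strip_next else value
        if not text:
            continue  # skip if all whitespace; strip_next stays set
        strip_next = False
        out.append((kind, text))
    return out

print(rtrim_after_special([("special", "<|user|>"), ("text", "\nHello")]))
# -> [('special', '<|user|>'), ('text', 'Hello')]
```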
@@ -153,11 +153,23 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
         'Ⅵ-a',       # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
         '\uFEFF//',  # unicode_ranges_control, 0xFEFF (BOM)
         'Cửa Việt',  # llama-3, ignore_merges = true
-        '<s>a',  # TODO: Phi-3 fail
+        '<s>a',  # Phi-3 fail
+        '<unk><|endoftext|><s>',  # Phi-3 fail
         'a\na',  # TODO: Bert fail
     ]
 
 
+def generator_random_special_tokens(special_tokens: list[str], iterations=100) -> Iterator[str]:
+    special_tokens = set(special_tokens)
+    special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
+    special_tokens = list(sorted(special_tokens))
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        words = rand.choices(special_tokens, k=500)
+        yield "".join(words)
+
+
 def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
     """Brute force check all vocab words"""
     yield from vocab
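A quick usage sketch for the new generator (assuming it and the test file's `import random` are in scope; the token list below is a stand-in for `tokenizer.all_special_tokens`):

```python
# Usage sketch; the token list is a stand-in for tokenizer.all_special_tokens.
specials = ["<s>", "</s>", "<unk>", "<|endoftext|>"]
sample = next(generator_random_special_tokens(specials, iterations=1))
print(sample[:60])  # each yield concatenates 500 seeded random picks
```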
@@ -289,14 +301,31 @@ def main(argv: list[str] = None):
     vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
+    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer.all_special_tokens, 10_000))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 10_000))
+    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
     # test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_bytes(10_000)) # FAIL
 
     model.free()
 
 
 if __name__ == "__main__":
-    main()
+    # main()
+
+    path_tokenizers = "./models/tokenizers/"
+    path_vocab_format = "./models/ggml-vocab-%s.gguf"
+
+    # import os
+    # tokenizers = os.listdir(path_tokenizers)
+    tokenizers = [
+        "llama-spm",  # SPM
+        "phi-3",      # SPM
+    ]
+
+    for tokenizer in tokenizers:
+        print("\n" + "=" * 50 + "\n" + tokenizer + "\n")  # noqa
+        vocab_file = path_vocab_format % tokenizer
+        dir_tokenizer = path_tokenizers + "/" + tokenizer
+        main([vocab_file, dir_tokenizer, "--verbose"])