mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 13:27:21 +01:00
test-tokenizer-random : reduce potential conflicts with #8379
* test-tokenizer-random : add a failing edge case for falcon
This commit is contained in:
parent
1caa20fc7a
commit
59ce85318a
@ -232,6 +232,7 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
|
|||||||
'a\na', # bert fail
|
'a\na', # bert fail
|
||||||
'"`', # falcon
|
'"`', # falcon
|
||||||
' \u2e4e', # falcon
|
' \u2e4e', # falcon
|
||||||
|
'\n\x0b ', # falcon
|
||||||
'a\xa0\xa0\x00b', # jina-v2-es
|
'a\xa0\xa0\x00b', # jina-v2-es
|
||||||
'one <mask>', # jina-v2-es <mask> lstrip=true
|
'one <mask>', # jina-v2-es <mask> lstrip=true
|
||||||
'a </s> b', # rstrip phi-3
|
'a </s> b', # rstrip phi-3
|
||||||
@ -458,8 +459,8 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl
|
|||||||
i = find_first_mismatch(ids1, ids2)
|
i = find_first_mismatch(ids1, ids2)
|
||||||
ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
|
ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
|
||||||
ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
|
ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
|
||||||
logger.error(" Expected: " + str(ids1) + f" {[tokenizer1.decode([id]) for id in ids1]}")
|
logger.error(" Expected: " + str(ids1))
|
||||||
logger.error(" Result: " + str(ids2) + f" {[tokenizer2.decode([id]) for id in ids2]}")
|
logger.error(" Result: " + str(ids2))
|
||||||
encode_errors += 1
|
encode_errors += 1
|
||||||
logger.error(f" {encode_errors=}")
|
logger.error(f" {encode_errors=}")
|
||||||
if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):
|
if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user