mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-26 06:10:29 +01:00
convert : handle tokenizer merges format from transformers 4.45 (#9696)
This commit is contained in:
parent
841713e1e4
commit
e3c355ba65
@@ -122,8 +122,30 @@ class SpecialVocab:
|
||||
tokenizer = json.load(f)
|
||||
if self.load_merges:
|
||||
merges = tokenizer.get('model', {}).get('merges')
|
||||
if isinstance(merges, list) and merges and isinstance(merges[0], str):
|
||||
self.merges = merges
|
||||
if isinstance(merges, list) and merges:
|
||||
if isinstance(merges[0], str):
|
||||
self.merges = merges
|
||||
elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str):
|
||||
# New format since transformers 4.45 to support spaces in merges
|
||||
# ref: https://github.com/ggerganov/llama.cpp/issues/9692
|
||||
# TODO: internally store as the new format instead of converting to old
|
||||
if any(' ' in s for pair in merges for s in pair):
|
||||
logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}')
|
||||
self.merges = [
|
||||
' '.join(
|
||||
[
|
||||
# ensure the spaces are properly encoded
|
||||
''.join(
|
||||
chr(ord(c) + 256) if c == ' ' else c
|
||||
for c in part
|
||||
)
|
||||
for part in pair
|
||||
]
|
||||
)
|
||||
for pair in merges
|
||||
]
|
||||
else:
|
||||
raise ValueError("Unknown tokenizer merges format")
|
||||
added_tokens = tokenizer.get('added_tokens', {})
|
||||
else:
|
||||
added_tokens = {}
|
||||
|
Loading…
Reference in New Issue
Block a user