mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-26 03:12:23 +01:00
convert : fix vocab size when not defined in hparams (#3421)
This commit is contained in:
parent
e78f0b0d05
commit
1c84003c08
@ -134,26 +134,19 @@ print("gguf: get tokenizer metadata")
|
|||||||
|
|
||||||
tokens: list[bytearray] = []
|
tokens: list[bytearray] = []
|
||||||
|
|
||||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
|
||||||
if not tokenizer_json_file.is_file():
|
|
||||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# gpt2 tokenizer
|
# gpt2 tokenizer
|
||||||
gguf_writer.add_tokenizer_model("gpt2")
|
gguf_writer.add_tokenizer_model("gpt2")
|
||||||
|
|
||||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
|
||||||
tokenizer_json = json.load(f)
|
|
||||||
|
|
||||||
print("gguf: get gpt2 tokenizer vocab")
|
print("gguf: get gpt2 tokenizer vocab")
|
||||||
|
|
||||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
|
||||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
|
||||||
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
|
|
||||||
|
|
||||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||||
|
|
||||||
|
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||||
|
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||||
|
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||||
|
assert max(tokenizer.vocab.values()) < vocab_size
|
||||||
|
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||||
byte_encoder = bytes_to_unicode()
|
byte_encoder = bytes_to_unicode()
|
||||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||||
|
@ -131,24 +131,19 @@ print("gguf: get tokenizer metadata")
|
|||||||
|
|
||||||
tokens: list[bytearray] = []
|
tokens: list[bytearray] = []
|
||||||
|
|
||||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
|
||||||
if not tokenizer_json_file.is_file():
|
|
||||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# gpt2 tokenizer
|
# gpt2 tokenizer
|
||||||
gguf_writer.add_tokenizer_model("gpt2")
|
gguf_writer.add_tokenizer_model("gpt2")
|
||||||
|
|
||||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
|
||||||
tokenizer_json = json.load(f)
|
|
||||||
|
|
||||||
print("gguf: get gpt2 tokenizer vocab")
|
print("gguf: get gpt2 tokenizer vocab")
|
||||||
|
|
||||||
vocab_size = len(tokenizer_json["model"]["vocab"])
|
|
||||||
|
|
||||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||||
|
|
||||||
|
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||||
|
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||||
|
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||||
|
assert max(tokenizer.vocab.values()) < vocab_size
|
||||||
|
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||||
byte_encoder = bytes_to_unicode()
|
byte_encoder = bytes_to_unicode()
|
||||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||||
|
@ -118,26 +118,19 @@ print("gguf: get tokenizer metadata")
|
|||||||
|
|
||||||
tokens: list[bytearray] = []
|
tokens: list[bytearray] = []
|
||||||
|
|
||||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
|
||||||
if not tokenizer_json_file.is_file():
|
|
||||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# gpt2 tokenizer
|
# gpt2 tokenizer
|
||||||
gguf_writer.add_tokenizer_model("gpt2")
|
gguf_writer.add_tokenizer_model("gpt2")
|
||||||
|
|
||||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
|
||||||
tokenizer_json = json.load(f)
|
|
||||||
|
|
||||||
print("gguf: get gpt2 tokenizer vocab")
|
print("gguf: get gpt2 tokenizer vocab")
|
||||||
|
|
||||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
|
||||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
|
||||||
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
|
|
||||||
|
|
||||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||||
|
|
||||||
|
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||||
|
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||||
|
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||||
|
assert max(tokenizer.vocab.values()) < vocab_size
|
||||||
|
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||||
byte_encoder = bytes_to_unicode()
|
byte_encoder = bytes_to_unicode()
|
||||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||||
|
Loading…
Reference in New Issue
Block a user