falcon : use stated vocab size (#2914)

This commit is contained in:
akawrykow 2023-09-14 10:19:42 -07:00 committed by GitHub
parent 990a5e226a
commit 5c872dbca2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -137,7 +137,9 @@ with open(tokenizer_json_file, "r", encoding="utf-8") as f:
print("gguf: get gpt2 tokenizer vocab")
vocab_size = len(tokenizer_json["model"]["vocab"])
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)