py : improve BPE tokenizer support (#5189)

This commit is contained in:
Sang-Kil Park 2024-01-29 18:24:19 +09:00 committed by GitHub
parent fbe7dfa53c
commit e76627bcce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -334,7 +334,10 @@ class Params:
class BpeVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
self.vocab = self.bpe_tokenizer["model"]["vocab"]
try:
self.vocab = self.bpe_tokenizer["model"]["vocab"]
except:
self.vocab = self.bpe_tokenizer
added_tokens: dict[str, int]
if fname_added_tokens is not None:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.