From 3c86af28f16bcf6f5ed897dc3864d47e6722aae0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roberto=20Tom=C3=A1s=20Collins?= Date: Thu, 17 Oct 2024 23:10:54 -0400 Subject: [PATCH 1/2] basic concept --- .gitignore | 190 ++++++++++++++++++++++++++++++++++++++++++ convert_hf_to_gguf.py | 188 +++++++++++++++++++++++++---------------- 2 files changed, 306 insertions(+), 72 deletions(-) diff --git a/.gitignore b/.gitignore index 1092d097a..70f2be418 100644 --- a/.gitignore +++ b/.gitignore @@ -133,3 +133,193 @@ poetry.toml # Test models for lora adapters /lora-tests + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index da5feb25b..e3ec2d4cb 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -740,8 +740,8 @@ class Model: special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) special_vocab.add_to_gguf(self.gguf_writer) - def _set_vocab_sentencepiece(self, add_to_gguf=True): - tokens, scores, toktypes = self._create_vocab_sentencepiece() + def _set_vocab_sentencepiece(self, add_to_gguf=True, use_tokenizer_json=False): + tokens, scores, toktypes = self._create_vocab_sentencepiece(use_tokenizer_json) self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_pre("default") @@ -752,7 +752,7 @@ class Model: special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - def _create_vocab_sentencepiece(self): + def _create_vocab_sentencepiece(self, use_tokenizer_json=False): from sentencepiece import SentencePieceProcessor tokenizer_path = self.dir_model / 'tokenizer.model' @@ -760,77 +760,114 @@ class Model: if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) + try: + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [gguf.TokenType.UNUSED] * vocab_size - for token_id in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) - toktype 
= SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE + toktype = gguf.TokenType.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = gguf.TokenType.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = gguf.TokenType.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = gguf.TokenType.UNUSED + elif tokenizer.IsByte(token_id): + toktype = gguf.TokenType.BYTE - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - for key in added_tokens_json: - token_id = added_tokens_json[key] - if token_id >= vocab_size: - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue + # Handle added tokens from added_tokens.json + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + for key in added_tokens_json: + token_id = added_tokens_json[key] + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = gguf.TokenType.USER_DEFINED - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) - for token_id, token_data in added_tokens_decoder.items(): - token_id = int(token_id) - token: str = token_data["content"] - if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - if tokens[token_id] != token.encode("utf-8"): - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}') - if token_data.get("special") or self.does_token_look_special(token): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - else: - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + # Handle added tokens from tokenizer.json (Salamandra models) + if use_tokenizer_json: + tokenizer_json_file = self.dir_model / 'tokenizer.json' + if tokenizer_json_file.is_file(): + with open(tokenizer_json_file, 'r', encoding='utf-8') as f: + tokenizer_json = json.load(f) + added_tokens = tokenizer_json.get('added_tokens', []) + for token_data in added_tokens: + token = token_data.get('content') + token_id = token_data.get('id') + if token is None or token_id is None: + logger.warning(f'Missing token content or id in tokenizer.json: {token_data}') + continue + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue - scores[token_id] = -1000.0 - tokens[token_id] = 
token.encode("utf-8") + tokens[token_id] = token.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = gguf.TokenType.USER_DEFINED + else: + logger.warning(f"tokenizer.json file not found at {tokenizer_json_file}") - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) + for token_id_str, token_data in added_tokens_decoder.items(): + token_id = int(token_id_str) + token: str = token_data.get("content") + if token is None: + logger.warning(f'Missing token content in tokenizer_config.json for token_id {token_id}') + continue + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + if toktypes[token_id] != gguf.TokenType.UNUSED: + if tokens[token_id] != token.encode("utf-8"): + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}') + if token_data.get("special") or self.does_token_look_special(token): + toktypes[token_id] = gguf.TokenType.CONTROL + else: + token = token.replace("\u2581", " ") # pre-normalize user-defined spaces + toktypes[token_id] = gguf.TokenType.USER_DEFINED - return tokens, scores, toktypes + scores[token_id] = -1000.0 + tokens[token_id] = token.encode("utf-8") + else: + logger.debug(f"tokenizer_config.json file not found at {tokenizer_config_file}") + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(f"[PAD{i}]".encode("utf-8")) + scores.append(-1000.0) + toktypes.append(gguf.TokenType.UNUSED) + + return tokens, scores, toktypes + + except Exception as e: + logger.error(f"Exception occurred in _create_vocab_sentencepiece: {e}") + raise # Re-raise the exception to handle it appropriately def _set_vocab_llama_hf(self): vocab = gguf.LlamaHfVocab(self.dir_model) @@ -1512,25 +1549,32 @@ class StableLMModel(Model): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "SalamandraForCausalLM") class LlamaModel(Model): model_arch = gguf.MODEL_ARCH.LLAMA def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: + tokenizer_model_file = self.dir_model / 'tokenizer.model' + tokenizer_json_file = self.dir_model / 'tokenizer.json' + + if tokenizer_model_file.is_file() and tokenizer_json_file.is_file(): + # Handle Salamandra models with both tokenizer.model and tokenizer.json + self._set_vocab_sentencepiece(use_tokenizer_json=True) + else: try: - self._set_vocab_llama_hf() - except (FileNotFoundError, TypeError): - # Llama 3 - self._set_vocab_gpt2() + self._set_vocab_sentencepiece() + except FileNotFoundError: + try: + self._set_vocab_llama_hf() + except (FileNotFoundError, TypeError): + # Llama 3 + self._set_vocab_gpt2() # 
Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
         if self.hparams.get("vocab_size", 32000) == 32016:
             special_vocab = gguf.SpecialVocab(
                 self.dir_model, load_merges=False,
-                special_token_types = ['prefix', 'suffix', 'middle', 'eot']
+                special_token_types=['prefix', 'suffix', 'middle', 'eot']
             )
             special_vocab._set_special_token("prefix", 32007)
             special_vocab._set_special_token("suffix", 32008)
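
Note: the Salamandra path added above works by overlaying the added_tokens list from tokenizer.json onto the base SentencePiece vocabulary. The snippet below is a minimal standalone sketch of that overlay for reviewers, assuming the gguf Python package is installed; the helper name overlay_added_tokens is illustrative and is not part of this patch.

import json
from pathlib import Path

import gguf  # provides gguf.TokenType


def overlay_added_tokens(dir_model: Path, tokens: list, scores: list, toktypes: list) -> None:
    # Mirror _create_vocab_sentencepiece(use_tokenizer_json=True): read added_tokens from
    # tokenizer.json and overwrite the matching slots in the SentencePiece-derived vocab.
    tokenizer_json_file = dir_model / "tokenizer.json"
    if not tokenizer_json_file.is_file():
        return
    with open(tokenizer_json_file, "r", encoding="utf-8") as f:
        added_tokens = json.load(f).get("added_tokens", [])
    for entry in added_tokens:
        token = entry.get("content")
        token_id = entry.get("id")
        if token is None or token_id is None or token_id >= len(tokens):
            continue  # skip malformed or out-of-range entries (the patch logs a warning here)
        tokens[token_id] = token.encode("utf-8")
        scores[token_id] = -1000.0
        toktypes[token_id] = gguf.TokenType.USER_DEFINED

As in the diff, each added token gets a score of -1000.0 and the USER_DEFINED type, so it overrides any [PAD] placeholder occupying the same id in the base vocabulary.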