feat: Introduce VocabFactory for flexible vocabulary management in model conversion

- The VocabFactory class is added to facilitate modular vocabulary handling.
- The constructor initializes a directory path and detects vocabulary-related files.
- The _select_file method provides file paths based on vocabulary type (e.g., BPE, SentencePiece).
- The _create_special_vocab method generates special vocabularies, accommodating different types.
- The load_vocab method loads vocabularies, handling BPE, SentencePiece, and Hugging Face Fast Tokenizer.
- Error handling and logging enhance debugging and user feedback.
- The modular and flexible design simplifies vocabulary management and supports future extensions.

The VocabFactory class enhances code modularity and maintainability, allowing versatile vocabulary handling in the model conversion process.
2024-12-26 14:20:31 +01:00 · 2024-01-07 21:32:42 -05:00 · 2024-01-07 21:32:42 -05:00 · 8aa5818a20
commit 8aa5818a20
parent 5fa1a08c2f
1 changed files with 77 additions and 0 deletions
--- a/convert.py
+++ b/convert.py
@ -1355,6 +1355,83 @@ def load_some_model(path: Path) -> ModelPlus:
    return model_plus
 class VocabFactory:
    """Locate tokenizer files near a model path and build vocab objects from them.

    On construction the factory probes ``path`` and then ``path.parent`` for
    each known tokenizer file name and records where it was found in
    ``self.files``; entries that were not found stay ``None``.
    """

    def __init__(self, path: Path):
        """Remember ``path`` and immediately scan for tokenizer files around it."""
        self.path = path
        # Known tokenizer file name -> location where it was found (None = not found).
        self.files = {
            "tokenizer.model": None,
            "vocab.json": None,
            "tokenizer.json": None,
        }
        self._detect_files()

    def _detect_files(self):
        """Fill ``self.files``, preferring ``self.path`` over ``self.path.parent``."""
        for file in self.files:
            # Only values are reassigned, so iterating the dict here is safe.
            for directory in (self.path, self.path.parent):
                candidate = directory / file
                if candidate.exists():
                    self.files[file] = candidate
                    break

    def _select_file(self, vocabtype: Optional[str]) -> Path:
        """Return the path that the loader for ``vocabtype`` should be given.

        ``"spm"`` maps to the detected ``tokenizer.model`` and ``"bpe"`` to the
        detected ``vocab.json``. ``"hfft"`` returns ``self.path`` itself rather
        than a specific file.

        Raises:
            FileNotFoundError: the file needed for ``"spm"``/``"bpe"`` was not
                detected.
            ValueError: ``vocabtype`` is not one of ``"spm"``, ``"bpe"``,
                ``"hfft"``.
        """
        if vocabtype in ["spm", "bpe"]:
            file_key = "tokenizer.model" if vocabtype == "spm" else "vocab.json"
            found = self.files[file_key]
            if found is None:
                raise FileNotFoundError(f"{vocabtype} {file_key} not found.")
            return found
        if vocabtype == "hfft":
            # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
            return self.path
        raise ValueError(f"Unsupported vocabulary type {vocabtype}")

    @staticmethod
    def _added_tokens_path(path: Path) -> Optional[Path]:
        """Return the ``added_tokens.json`` belonging to ``path``, or ``None``.

        ``path`` may be a tokenizer file (look beside it) or a directory (look
        inside it). Always using ``path.parent`` would search the wrong
        directory whenever ``_select_file`` hands back a directory.
        """
        directory = path if path.is_dir() else path.parent
        candidate = directory / "added_tokens.json"
        return candidate if candidate.exists() else None

    def _create_special_vocab(
        self,
        vocab: Vocab,
        vocabtype: str,
        model_parent_path: Path,
    ) -> gguf.SpecialVocab:
        """Build the ``gguf.SpecialVocab`` that accompanies ``vocab``.

        Merges are requested only for ``"bpe"``. ``n_vocab`` is taken from
        ``vocab.vocab_size`` when the vocab object has that attribute and is
        ``None`` otherwise.
        """
        return gguf.SpecialVocab(
            model_parent_path,
            load_merges=(vocabtype == "bpe"),
            # NOTE(review): None is passed through as-is; presumably this lets
            # gguf.SpecialVocab pick its own defaults — confirm.
            special_token_types=None,
            n_vocab=getattr(vocab, "vocab_size", None),
        )

    def load_vocab(
        self, vocabtype: str, model_parent_path: Path
    ) -> Tuple[Vocab, gguf.SpecialVocab]:
        """Load the vocabulary of type ``vocabtype`` plus its special vocab.

        Args:
            vocabtype: ``"bpe"``, ``"spm"`` or ``"hfft"``.
            model_parent_path: forwarded to ``gguf.SpecialVocab``.

        Returns:
            A ``(vocab, special_vocab)`` pair.

        Raises:
            FileNotFoundError: the tokenizer file for ``vocabtype`` was not
                detected.
            ValueError: ``vocabtype`` is not supported.
        """
        path = self._select_file(vocabtype)
        print(f"Loading vocab file '{path}', type '{vocabtype}'")
        # Resolved once; None when no added_tokens.json accompanies the vocab.
        added_tokens_path = self._added_tokens_path(path)
        if vocabtype == "bpe":
            vocab = BpeVocab(path, added_tokens_path)
        elif vocabtype == "spm":
            vocab = SentencePieceVocab(path, added_tokens_path)
        elif vocabtype == "hfft":
            vocab = HfVocab(path, added_tokens_path)
        else:
            # _select_file already rejects unknown types; kept as a guard in
            # case the two lists of supported types ever drift apart.
            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
        special_vocab = self._create_special_vocab(
            vocab,
            vocabtype,
            model_parent_path,
        )
        return vocab, special_vocab
 def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
    namestr = {
        GGMLFileType.AllF32:    "f32",