From 7e4a4ebc1086ee16f21e4139b4b911afc0588ba3 Mon Sep 17 00:00:00 2001
From: teleprint-me <77757836+teleprint-me@users.noreply.github.com>
Date: Sun, 7 Jan 2024 20:20:38 -0500
Subject: [PATCH] refactor: Enhance readability, functionality, and code quality

- Improved code formatting and readability for better maintainability.
- Refactored LazyUnpickler's CLASSES dictionary for clarity.
- Added print statements and warnings in check_vocab_size for user feedback.
- Removed find_vocab_file_path, as it's superseded by VocabFactory.
- Preparatory changes for upcoming classes: OutputFile and VocabFactory.
- Overall focus on code quality, error handling, and consistency.

These changes reflect a continuous effort to refine the codebase, ensuring it
meets best practices and prepares for future enhancements, such as the
VocabFactory.
---
 convert.py | 53 ++++++++++++++++++++++++++---------------------------
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/convert.py b/convert.py
index 10642ec8f..095a1020e 100755
--- a/convert.py
+++ b/convert.py
@@ -868,13 +868,17 @@ class LazyUnpickler(pickle.Unpickler):
     CLASSES: dict[tuple[str, str], Any] = {
         # getattr used here as a workaround for mypy not being smart enough to determine
         # the staticmethods have a __func__ attribute.
-        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
-        ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
-        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
-        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
-        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
-        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
-        ('torch', 'Tensor'): LazyTensor,
+        ("torch._tensor", "_rebuild_from_type_v2"): getattr(
+            rebuild_from_type_v2, "__func__"
+        ),
+        ("torch._utils", "_rebuild_tensor_v2"): getattr(
+            lazy_rebuild_tensor_v2, "__func__"
+        ),
+        ("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16),
+        ("torch", "HalfStorage"): LazyStorageKind(DT_F16),
+        ("torch", "FloatStorage"): LazyStorageKind(DT_F32),
+        ("torch", "IntStorage"): LazyStorageKind(DT_I32),
+        ("torch", "Tensor"): LazyTensor,
     }

     def find_class(self, module: str, name: str) -> Any:
@@ -985,16 +989,17 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
 def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
     if params.n_vocab != vocab.vocab_size:
         if params.n_vocab == vocab.vocab_size:
-            print("Ignoring added_tokens.json since model matches vocab size without it.")
-            vocab.added_tokens_dict = OrderedDict()
-            vocab.vocab_size = vocab.vocab_size
+            print(
+                "Ignoring added_tokens.json since model matches vocab size without it."
+            )
             return
-
         if pad_vocab and params.n_vocab > vocab.vocab_size:
             pad_count = params.n_vocab - vocab.vocab_size
-            print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
+            print(
+                f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
+            )
             for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
-                vocab.added_tokens_dict[f'<dummy{i:05}>'] = -1
+                vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
             vocab.vocab_size = params.n_vocab
             return
     msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
@@ -1002,7 +1007,14 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
     if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
         msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
     if vocab.vocab_size < params.n_vocab:
-        msg += " Possibly try using the --padvocab option."
+        msg += " Add the --pad-vocab option and try again."
+
+    # Check if params.n_vocab is -1 and issue a warning
+    if params.n_vocab == -1:
+        warnings.warn(
+            "WARNING: The model's vocab size is set to -1 in params.json. Please update it manually."
+        )
+
     raise Exception(msg)


@@ -1289,19 +1301,6 @@ def load_some_model(path: Path) -> ModelPlus:
     return model_plus


-def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]:
-    path2 = path / vocab_file
-    # Use `.parent` instead of /.. to handle the symlink case better.
-    path3 = path.parent / vocab_file
-
-    if path2.exists():
-        return path2
-    if path3.exists():
-        return path3
-
-    return None
-
-
 def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
     namestr = {
         GGMLFileType.AllF32: "f32",
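
Note (not part of the patch): below is a minimal, self-contained sketch of the
vocab-padding behaviour that the updated check_vocab_size() performs when
--pad-vocab is passed and the model header declares more tokens than the
tokenizer provides. The helper name pad_vocab_sketch and the example sizes are
hypothetical and do not exist in convert.py; only the <dummyNNNNN> naming and
the placeholder score of -1 come from the diff above.

    from collections import OrderedDict

    def pad_vocab_sketch(n_vocab_model: int, vocab_size: int) -> "OrderedDict[str, int]":
        # Hypothetical stand-in for the loop added in check_vocab_size():
        # fill the gap between the model's declared vocab size and the
        # tokenizer's actual size with <dummyNNNNN> placeholder tokens.
        added: "OrderedDict[str, int]" = OrderedDict()
        pad_count = n_vocab_model - vocab_size
        print(f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>")
        for i in range(1, pad_count + 1):
            added[f"<dummy{i:05}>"] = -1  # -1 marks these entries as placeholders
        return added

    # Example: the model header declares 32016 tokens but the tokenizer has 32000.
    print(list(pad_vocab_sketch(32016, 32000).items())[:3])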