Refine Model Hyperparameters and Params Class

- Updated type annotations to use `Optional` for clarity.
- Improved method names and attribute consistency.
- Removed unnecessary variables for better code readability.

Additional Notes:

- Highlighted the use of `Optional` for clearer intent.
- Ensured backward and forward compatibility: `Optional[X]` is equivalent to `X | None`, but also works on Python versions that predate PEP 604's union syntax (3.10). A short sketch follows these notes.
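
As a minimal sketch of the annotation style the diff below adopts (the `ExampleParams` dataclass is hypothetical, not the real `Params` class):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class ExampleParams:
    # Required fields: the loader must supply these.
    n_embd: int
    n_layer: int
    # Optional[float] means "float or None", the same as `float | None`,
    # but it also evaluates on Python < 3.10, where the `X | None` union
    # syntax in an eagerly evaluated annotation raises a TypeError.
    f_norm_eps: Optional[float] = None


params = ExampleParams(n_embd=4096, n_layer=32)
assert params.f_norm_eps is None
```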
Author: teleprint-me
Date:   2024-01-07 19:25:07 -05:00
Commit: 15e18973da (parent: acf8f4b20f)

```diff
@@ -189,37 +189,57 @@ class Params:
     n_ff: int
     n_head: int
     n_head_kv: int
-    n_experts: int | None = None
-    n_experts_used: int | None = None
-    f_norm_eps: float | None = None
-    rope_scaling_type: gguf.RopeScalingType | None = None
-    f_rope_freq_base: float | None = None
-    f_rope_scale: float | None = None
-    n_orig_ctx: int | None = None
-    rope_finetuned: bool | None = None
-    ftype: GGMLFileType | None = None
+    f_norm_eps: Optional[float] = None
+    n_experts: Optional[int] = None
+    n_experts_used: Optional[int] = None
+    rope_scaling_type: Optional[gguf.RopeScalingType] = None
+    f_rope_freq_base: Optional[float] = None
+    f_rope_scale: Optional[float] = None
+    n_orig_ctx: Optional[int] = None
+    rope_finetuned: Optional[bool] = None
+    ftype: Optional[GGMLFileType] = None
 
     # path to the directory containing the model files
-    path_model: Path | None = None
+    path_model: Optional[Path] = None
 
     @staticmethod
-    def guessed(model: LazyModel) -> Params:
+    def guessed(model: LazyModel) -> "Params":
         # try transformer naming first
-        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
+        n_vocab, n_embd = (
+            model["model.embed_tokens.weight"].shape
+            if "model.embed_tokens.weight" in model
+            else model["tok_embeddings.weight"].shape
+        )
 
         # try transformer naming first
         if "model.layers.0.self_attn.q_proj.weight" in model:
-            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
-        elif "model.layers.0.self_attn.W_pack.weight" in model:  # next: try baichuan naming
-            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
+            n_layer = next(
+                i
+                for i in itertools.count()
+                if f"model.layers.{i}.self_attn.q_proj.weight" not in model
+            )
+        elif (
+            "model.layers.0.self_attn.W_pack.weight" in model
+        ):  # next: try baichuan naming
+            n_layer = next(
+                i
+                for i in itertools.count()
+                if f"model.layers.{i}.self_attn.W_pack.weight" not in model
+            )
         else:
-            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+            n_layer = next(
+                i
+                for i in itertools.count()
+                if f"layers.{i}.attention.wq.weight" not in model
+            )
 
         if n_layer < 1:
-            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+            raise Exception(
+                "failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
+            )
 
         n_head = n_embd // 128  # guessed
         n_mult = 256  # guessed
```
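
The `n_layer` probes in this hunk all use the same idiom: `next` over `itertools.count()` yields the first layer index whose weight tensor is absent, which equals the number of consecutive layers present. A standalone sketch, with a plain dict of tensor names standing in for the real `LazyModel`:

```python
import itertools

# Hypothetical stand-in for LazyModel: tensor names mapped to lazy tensors.
model = {
    "model.layers.0.self_attn.q_proj.weight": ...,
    "model.layers.1.self_attn.q_proj.weight": ...,
    "model.layers.2.self_attn.q_proj.weight": ...,
}

# Probe indices 0, 1, 2, ... and stop at the first index whose weight
# name is missing; that index is the layer count.
n_layer = next(
    i
    for i in itertools.count()
    if f"model.layers.{i}.self_attn.q_proj.weight" not in model
)
assert n_layer == 3
```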
```diff
@@ -240,7 +260,7 @@ class Params:
         )
 
     @staticmethod
-    def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
+    def load_transformers_config(model: LazyModel, config_path: Path) -> "Params":
         config = json.load(open(config_path))
 
         rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
```
```diff
@@ -253,18 +273,20 @@ class Params:
                 rope_scaling_type = gguf.RopeScalingType.LINEAR
             elif typ == "yarn":
                 rope_scaling_type = gguf.RopeScalingType.YARN
-                n_orig_ctx = rope_scaling['original_max_position_embeddings']
-                rope_finetuned = rope_scaling['finetuned']
+                n_orig_ctx = rope_scaling["original_max_position_embeddings"]
+                rope_finetuned = rope_scaling["finetuned"]
             else:
-                raise NotImplementedError(f'Unknown rope scaling type: {typ}')
+                raise NotImplementedError(f"Unknown rope scaling type: {typ}")
 
         if "max_sequence_length" in config:
            n_ctx = config["max_sequence_length"]
         elif "max_position_embeddings" in config:
             n_ctx = config["max_position_embeddings"]
         else:
-            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+            raise Exception(
+                "failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
+                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
+            )
 
         n_experts = None
         n_experts_used = None
```
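
The `n_ctx` lookup above is a plain key-preference chain over the parsed `config.json`: `max_sequence_length` wins, `max_position_embeddings` is the fallback. A runnable sketch against a hypothetical config payload:

```python
import json

# Hypothetical config.json contents; real checkpoints carry one of these keys.
config = json.loads('{"max_position_embeddings": 4096, "rope_scaling": null}')

if "max_sequence_length" in config:
    n_ctx = config["max_sequence_length"]
elif "max_position_embeddings" in config:
    n_ctx = config["max_position_embeddings"]
else:
    raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.")

assert n_ctx == 4096
```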
```diff
@@ -294,7 +316,7 @@ class Params:
     # LLaMA v2 70B params.json
     # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
     @staticmethod
-    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
+    def load_torch_params(model: LazyModel, config_path: Path) -> "Params":
         config = json.load(open(config_path))
 
         n_experts = None
```
```diff
@@ -325,7 +347,7 @@ class Params:
             f_rope_freq_base = 1e6
 
         return Params(
-            n_vocab = model["tok_embeddings.weight"].shape[0],
+            n_vocab=config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
            n_embd=config["dim"],
             n_layer=config["n_layers"],
             n_ctx=n_ctx,
```
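
The `n_vocab` change prefers the vocabulary size recorded in `params.json` and falls back to the embedding tensor's first dimension only when the key is absent. A sketch with hypothetical stand-ins; note that `dict.get` does not treat a present sentinel value (such as the `"vocab_size": -1` in the LLaMA v2 70B example above) as missing:

```python
# Hypothetical stand-ins for the loaded config and model tensor shapes.
config = {"dim": 4096, "n_layers": 32}  # no "vocab_size" key
tok_embeddings_shape = (32000, 4096)    # (n_vocab, n_embd)

# dict.get returns the fallback only when the key is absent; a key that is
# present with value -1 would be returned as-is.
n_vocab = config.get("vocab_size", tok_embeddings_shape[0])
assert n_vocab == 32000
```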
```diff
@@ -339,18 +361,18 @@ class Params:
         )
 
     @staticmethod
-    def load(model_plus: ModelPlus) -> Params:
+    def load(model_plus: ModelPlus) -> "Params":
         hf_config_path = model_plus.paths[0].parent / "config.json"
         orig_config_path = model_plus.paths[0].parent / "params.json"
 
         if hf_config_path.exists():
-            params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
+            params = Params.load_transformers_config(model_plus.model, hf_config_path)
         elif orig_config_path.exists():
-            params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
-        elif model_plus.format != 'none':
+            params = Params.load_torch_params(model_plus.model, orig_config_path)
+        elif model_plus.format != "none":
             params = Params.guessed(model_plus.model)
         else:
-            raise ValueError('Cannot guess params when model format is none')
+            raise ValueError("Cannot guess params when model format is none")
 
         params.path_model = model_plus.paths[0].parent
```
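
`Params.load` dispatches on which config file sits next to the checkpoint, in a fixed priority order: a Hugging Face `config.json` wins over a torch-style `params.json`, and shape heuristics are the last resort. A hypothetical mirror of that order, not the real implementation:

```python
from pathlib import Path


def pick_loader(model_dir: Path) -> str:
    """Mirror Params.load's dispatch: config.json beats params.json."""
    if (model_dir / "config.json").exists():
        return "load_transformers_config"  # HF-style checkpoint
    if (model_dir / "params.json").exists():
        return "load_torch_params"         # original LLaMA checkpoint
    return "guessed"                       # fall back to shape heuristics
```

A consequence of this ordering is that a directory containing both files is always treated as a Hugging Face checkpoint.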