mirror of https://github.com/ggerganov/llama.cpp.git
Refine Model Hyperparameters and Params Class
- Updated type annotations to use `Optional` for clarity.
- Improved method names and attribute consistency.
- Removed unnecessary variables for better code readability.

Additional notes:

- Highlighted the use of `Optional` for clearer intent.
- Ensured backward and forward compatibility.
parent acf8f4b20f
commit 15e18973da
convert.py (84 changed lines)
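For context on the annotation change: `float | None` (PEP 604) and `Optional[float]` name the same type, but the `|` spelling is only evaluated successfully at runtime on Python 3.10+ (or under `from __future__ import annotations`), whereas `Optional` imports cleanly on older interpreters — presumably what the "backward and forward compatibility" note above refers to. A minimal sketch of the pattern, using a hypothetical trimmed-down stand-in for convert.py's `Params` dataclass:

```python
from dataclasses import dataclass
from pathlib import Path
from typing import Optional


@dataclass
class ParamsSketch:
    """Hypothetical, trimmed stand-in for convert.py's Params dataclass."""

    n_ff: int
    n_head: int
    # Before: `f_norm_eps: float | None = None` (PEP 604). Dataclass
    # annotations are evaluated when the class body runs, so this raises
    # TypeError on Python 3.9 and earlier.
    # After: `Optional[float]` -- identical meaning, works on older versions.
    f_norm_eps: Optional[float] = None
    path_model: Optional[Path] = None

    @staticmethod
    def guessed() -> "ParamsSketch":
        # Quoting the return type makes it a forward reference; the class
        # object does not exist yet while its own body is being evaluated.
        return ParamsSketch(n_ff=11008, n_head=32)
```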
@@ -189,37 +189,57 @@ class Params:
     n_ff: int
     n_head: int
     n_head_kv: int
-    n_experts: int | None = None
-    n_experts_used: int | None = None
-    f_norm_eps: float | None = None
+    f_norm_eps: Optional[float] = None
+    n_experts: Optional[int] = None
+    n_experts_used: Optional[int] = None
 
-    rope_scaling_type: gguf.RopeScalingType | None = None
-    f_rope_freq_base: float | None = None
-    f_rope_scale: float | None = None
-    n_orig_ctx: int | None = None
-    rope_finetuned: bool | None = None
+    rope_scaling_type: Optional[gguf.RopeScalingType] = None
+    f_rope_freq_base: Optional[float] = None
+    f_rope_scale: Optional[float] = None
+    n_orig_ctx: Optional[int] = None
+    rope_finetuned: Optional[bool] = None
 
-    ftype: GGMLFileType | None = None
+    ftype: Optional[GGMLFileType] = None
 
     # path to the directory containing the model files
-    path_model: Path | None = None
+    path_model: Optional[Path] = None
 
     @staticmethod
-    def guessed(model: LazyModel) -> Params:
+    def guessed(model: LazyModel) -> "Params":
         # try transformer naming first
-        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
+        n_vocab, n_embd = (
+            model["model.embed_tokens.weight"].shape
+            if "model.embed_tokens.weight" in model
+            else model["tok_embeddings.weight"].shape
+        )
 
         # try transformer naming first
         if "model.layers.0.self_attn.q_proj.weight" in model:
-            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
-        elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
-            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
+            n_layer = next(
+                i
+                for i in itertools.count()
+                if f"model.layers.{i}.self_attn.q_proj.weight" not in model
+            )
+        elif (
+            "model.layers.0.self_attn.W_pack.weight" in model
+        ):  # next: try baichuan naming
+            n_layer = next(
+                i
+                for i in itertools.count()
+                if f"model.layers.{i}.self_attn.W_pack.weight" not in model
+            )
         else:
-            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+            n_layer = next(
+                i
+                for i in itertools.count()
+                if f"layers.{i}.attention.wq.weight" not in model
+            )
 
         if n_layer < 1:
-            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+            raise Exception(
+                "failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
+            )
 
         n_head = n_embd // 128 # guessed
         n_mult = 256 # guessed
@@ -240,7 +260,7 @@ class Params:
         )
 
     @staticmethod
-    def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
+    def load_transformers_config(model: LazyModel, config_path: Path) -> "Params":
         config = json.load(open(config_path))
 
         rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
@@ -253,18 +273,20 @@ class Params:
                 rope_scaling_type = gguf.RopeScalingType.LINEAR
             elif typ == "yarn":
                 rope_scaling_type = gguf.RopeScalingType.YARN
-                n_orig_ctx = rope_scaling['original_max_position_embeddings']
-                rope_finetuned = rope_scaling['finetuned']
+                n_orig_ctx = rope_scaling["original_max_position_embeddings"]
+                rope_finetuned = rope_scaling["finetuned"]
             else:
-                raise NotImplementedError(f'Unknown rope scaling type: {typ}')
+                raise NotImplementedError(f"Unknown rope scaling type: {typ}")
 
         if "max_sequence_length" in config:
             n_ctx = config["max_sequence_length"]
         elif "max_position_embeddings" in config:
             n_ctx = config["max_position_embeddings"]
         else:
-            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+            raise Exception(
+                "failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
+                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
+            )
 
         n_experts = None
         n_experts_used = None
@@ -294,7 +316,7 @@ class Params:
     # LLaMA v2 70B params.json
     # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
     @staticmethod
-    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
+    def load_torch_params(model: LazyModel, config_path: Path) -> "Params":
         config = json.load(open(config_path))
 
         n_experts = None
@@ -325,7 +347,7 @@ class Params:
             f_rope_freq_base = 1e6
 
         return Params(
-            n_vocab = model["tok_embeddings.weight"].shape[0],
+            n_vocab=config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
            n_embd=config["dim"],
            n_layer=config["n_layers"],
            n_ctx=n_ctx,
@@ -339,18 +361,18 @@ class Params:
         )
 
     @staticmethod
-    def load(model_plus: ModelPlus) -> Params:
+    def load(model_plus: ModelPlus) -> "Params":
         hf_config_path = model_plus.paths[0].parent / "config.json"
         orig_config_path = model_plus.paths[0].parent / "params.json"
 
         if hf_config_path.exists():
-            params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
+            params = Params.load_transformers_config(model_plus.model, hf_config_path)
         elif orig_config_path.exists():
-            params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
-        elif model_plus.format != 'none':
+            params = Params.load_torch_params(model_plus.model, orig_config_path)
+        elif model_plus.format != "none":
             params = Params.guessed(model_plus.model)
         else:
-            raise ValueError('Cannot guess params when model format is none')
+            raise ValueError("Cannot guess params when model format is none")
 
         params.path_model = model_plus.paths[0].parent
 
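A side note on the layer-counting idiom the diff reflows: `next(i for i in itertools.count() if ... not in model)` probes tensor names layer by layer and returns the first index with no matching tensor, which is exactly the number of layers present. A self-contained sketch of the idiom (the helper name and toy name set below are illustrative, not part of convert.py):

```python
import itertools


def count_layers(tensor_names: set, pattern: str) -> int:
    """Return how many consecutive layers (0, 1, 2, ...) have a tensor
    matching `pattern` -- the first missing index is the layer count."""
    return next(i for i in itertools.count() if pattern.format(i) not in tensor_names)


# Toy stand-in for a LazyModel's tensor-name table:
names = {f"model.layers.{i}.self_attn.q_proj.weight" for i in range(4)}
assert count_layers(names, "model.layers.{}.self_attn.q_proj.weight") == 4
```

The renames themselves (`loadHFTransformerJson` → `load_transformers_config`, `loadOriginalParamsJson` → `load_torch_params`) bring the loaders in line with PEP 8 snake_case, while `Params.load` keeps the same dispatch order: HF `config.json` first, then the original `params.json`, then a guess from tensor shapes.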