mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 05:48:47 +01:00
llama : support small Granite models (#7481)
* Add optional MLP bias for Granite models Add optional MLP bias for ARCH_LLAMA to support Granite models. Partially addresses ggerganov/llama.cpp/issues/7116 Still needs some more changes to properly support Granite. * llama: honor add_space_prefix from the model configuration propagate the add_space_prefix configuration from the HF model configuration to the gguf file and honor it with the gpt2 tokenizer. Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com> * llama: add support for small granite models it works only for the small models 3b and 8b. The convert-hf-to-gguf.py script uses the vocabulary size of the granite models to detect granite and set the correct configuration. Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com> --------- Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com> Co-authored-by: Steffen Roecker <sroecker@redhat.com>
This commit is contained in:
parent
56411a950f
commit
5442939fcc
@ -1317,6 +1317,17 @@ class LlamaModel(Model):
|
|||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||||
|
|
||||||
|
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||||
|
if tokenizer_config_file.is_file():
|
||||||
|
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||||
|
tokenizer_config_json = json.load(f)
|
||||||
|
if "add_prefix_space" in tokenizer_config_json:
|
||||||
|
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
||||||
|
|
||||||
|
# Apply to granite small models only
|
||||||
|
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||||
|
self.gguf_writer.add_add_bos_token(False)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||||
if n_head_kv is not None and n_head != n_head_kv:
|
if n_head_kv is not None and n_head != n_head_kv:
|
||||||
@ -1331,9 +1342,9 @@ class LlamaModel(Model):
|
|||||||
n_head = self.hparams["num_attention_heads"]
|
n_head = self.hparams["num_attention_heads"]
|
||||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||||
|
|
||||||
if name.endswith("q_proj.weight"):
|
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||||
if name.endswith("k_proj.weight"):
|
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||||
|
|
||||||
# process the experts separately
|
# process the experts separately
|
||||||
|
27
llama.cpp
27
llama.cpp
@ -2028,8 +2028,9 @@ struct llama_layer {
|
|||||||
struct ggml_tensor * ffn_up_shexp;
|
struct ggml_tensor * ffn_up_shexp;
|
||||||
|
|
||||||
// ff bias
|
// ff bias
|
||||||
struct ggml_tensor * ffn_down_b; // b2
|
struct ggml_tensor * ffn_gate_b = nullptr;
|
||||||
struct ggml_tensor * ffn_up_b; // b3
|
struct ggml_tensor * ffn_down_b = nullptr; // b2
|
||||||
|
struct ggml_tensor * ffn_up_b = nullptr; // b3
|
||||||
struct ggml_tensor * ffn_act;
|
struct ggml_tensor * ffn_act;
|
||||||
|
|
||||||
// mamba proj
|
// mamba proj
|
||||||
@ -4058,7 +4059,9 @@ static void llm_load_hparams(
|
|||||||
switch (hparams.n_layer) {
|
switch (hparams.n_layer) {
|
||||||
case 22: model.type = e_model::MODEL_1B; break;
|
case 22: model.type = e_model::MODEL_1B; break;
|
||||||
case 26: model.type = e_model::MODEL_3B; break;
|
case 26: model.type = e_model::MODEL_3B; break;
|
||||||
case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
|
// granite uses a vocab with len 49152
|
||||||
|
case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
|
||||||
|
case 36: model.type = e_model::MODEL_8B; break; // granite
|
||||||
case 40: model.type = e_model::MODEL_13B; break;
|
case 40: model.type = e_model::MODEL_13B; break;
|
||||||
case 48: model.type = e_model::MODEL_34B; break;
|
case 48: model.type = e_model::MODEL_34B; break;
|
||||||
case 60: model.type = e_model::MODEL_30B; break;
|
case 60: model.type = e_model::MODEL_30B; break;
|
||||||
@ -4328,6 +4331,8 @@ static void llm_load_hparams(
|
|||||||
case 30: model.type = e_model::MODEL_3B; break;
|
case 30: model.type = e_model::MODEL_3B; break;
|
||||||
case 32: model.type = e_model::MODEL_7B; break;
|
case 32: model.type = e_model::MODEL_7B; break;
|
||||||
case 40: model.type = e_model::MODEL_15B; break;
|
case 40: model.type = e_model::MODEL_15B; break;
|
||||||
|
case 52: model.type = e_model::MODEL_20B; break; // granite
|
||||||
|
case 88: model.type = e_model::MODEL_34B; break; // granite
|
||||||
default: model.type = e_model::MODEL_UNKNOWN;
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
@ -4590,6 +4595,11 @@ static void llm_load_vocab(
|
|||||||
} else {
|
} else {
|
||||||
if (tokenizer_model == "gpt2") {
|
if (tokenizer_model == "gpt2") {
|
||||||
vocab.type = LLAMA_VOCAB_TYPE_BPE;
|
vocab.type = LLAMA_VOCAB_TYPE_BPE;
|
||||||
|
|
||||||
|
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
||||||
|
if (add_space_prefix_keyidx != -1) {
|
||||||
|
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
|
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
|
||||||
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
|
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
|
||||||
@ -5211,6 +5221,11 @@ static bool llm_load_tensors(
|
|||||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||||
|
|
||||||
|
// optional MLP bias
|
||||||
|
layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||||
|
layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||||
|
layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||||
} else {
|
} else {
|
||||||
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
||||||
|
|
||||||
@ -7483,9 +7498,9 @@ struct llm_build_context {
|
|||||||
cb(cur, "ffn_norm", il);
|
cb(cur, "ffn_norm", il);
|
||||||
|
|
||||||
cur = llm_build_ffn(ctx0, cur,
|
cur = llm_build_ffn(ctx0, cur,
|
||||||
model.layers[il].ffn_up, NULL,
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||||
model.layers[il].ffn_gate, NULL,
|
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
|
||||||
model.layers[il].ffn_down, NULL,
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||||
NULL,
|
NULL,
|
||||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
|
Loading…
Reference in New Issue
Block a user