From 4f407a0a353dae4726c74cc33250b623a4911dd7 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 10 Apr 2024 17:24:14 +0200 Subject: [PATCH] llama : add model types for mixtral (#6589) --- llama.cpp | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/llama.cpp b/llama.cpp index 8dbf47486..9ad9b10cb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1705,6 +1705,8 @@ enum e_model { MODEL_MEDIUM, MODEL_LARGE, MODEL_XL, + MODEL_8x7B, + MODEL_8x22B, }; static const size_t kiB = 1024; @@ -3558,6 +3560,8 @@ static const char * llama_model_type_name(e_model type) { case MODEL_MEDIUM: return "0.4B"; case MODEL_LARGE: return "0.8B"; case MODEL_XL: return "1.5B"; + case MODEL_8x7B: return "8x7B"; + case MODEL_8x22B: return "8x22B"; default: return "?B"; } } @@ -3672,15 +3676,23 @@ static void llm_load_hparams( { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { - case 22: model.type = e_model::MODEL_1B; break; - case 26: model.type = e_model::MODEL_3B; break; - case 32: model.type = e_model::MODEL_7B; break; - case 40: model.type = e_model::MODEL_13B; break; - case 48: model.type = e_model::MODEL_34B; break; - case 60: model.type = e_model::MODEL_30B; break; - case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break; - default: model.type = e_model::MODEL_UNKNOWN; + if (hparams.n_expert == 8) { + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_8x7B; break; + case 56: model.type = e_model::MODEL_8x22B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } else { + switch (hparams.n_layer) { + case 22: model.type = e_model::MODEL_1B; break; + case 26: model.type = e_model::MODEL_3B; break; + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + case 48: model.type = e_model::MODEL_34B; break; + case 60: model.type = e_model::MODEL_30B; break; + case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } } } break; case LLM_ARCH_MINICPM: