llama : model

ggml-ci
Georgi Gerganov 2024-12-22 20:41:05 +02:00
parent 29fd7b56d0
commit ac62ce0236
3 changed files with 129 additions and 106 deletions

--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp

@@ -2,7 +2,74 @@
 #include "llama-impl.h"

-std::string llama_model_ftype_name(llama_ftype ftype) {
+const char * llm_type_name(llm_type type) {
+    switch (type) {
+        case MODEL_14M: return "14M";
+        case MODEL_17M: return "17M";
+        case MODEL_22M: return "22M";
+        case MODEL_33M: return "33M";
+        case MODEL_60M: return "60M";
+        case MODEL_70M: return "70M";
+        case MODEL_80M: return "80M";
+        case MODEL_109M: return "109M";
+        case MODEL_137M: return "137M";
+        case MODEL_160M: return "160M";
+        case MODEL_220M: return "220M";
+        case MODEL_250M: return "250M";
+        case MODEL_270M: return "270M";
+        case MODEL_335M: return "335M";
+        case MODEL_410M: return "410M";
+        case MODEL_450M: return "450M";
+        case MODEL_770M: return "770M";
+        case MODEL_780M: return "780M";
+        case MODEL_0_5B: return "0.5B";
+        case MODEL_1B: return "1B";
+        case MODEL_1_3B: return "1.3B";
+        case MODEL_1_4B: return "1.4B";
+        case MODEL_1_5B: return "1.5B";
+        case MODEL_1_6B: return "1.6B";
+        case MODEL_2B: return "2B";
+        case MODEL_2_8B: return "2.8B";
+        case MODEL_3B: return "3B";
+        case MODEL_4B: return "4B";
+        case MODEL_6B: return "6B";
+        case MODEL_6_9B: return "6.9B";
+        case MODEL_7B: return "7B";
+        case MODEL_8B: return "8B";
+        case MODEL_9B: return "9B";
+        case MODEL_11B: return "11B";
+        case MODEL_12B: return "12B";
+        case MODEL_13B: return "13B";
+        case MODEL_14B: return "14B";
+        case MODEL_15B: return "15B";
+        case MODEL_16B: return "16B";
+        case MODEL_20B: return "20B";
+        case MODEL_30B: return "30B";
+        case MODEL_32B: return "32B";
+        case MODEL_34B: return "34B";
+        case MODEL_35B: return "35B";
+        case MODEL_40B: return "40B";
+        case MODEL_65B: return "65B";
+        case MODEL_70B: return "70B";
+        case MODEL_236B: return "236B";
+        case MODEL_314B: return "314B";
+        case MODEL_SMALL: return "0.1B";
+        case MODEL_MEDIUM: return "0.4B";
+        case MODEL_LARGE: return "0.8B";
+        case MODEL_XL: return "1.5B";
+        case MODEL_A1_7B: return "A1.7B";
+        case MODEL_A2_7B: return "A2.7B";
+        case MODEL_8x7B: return "8x7B";
+        case MODEL_8x22B: return "8x22B";
+        case MODEL_16x12B: return "16x12B";
+        case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+        case MODEL_57B_A14B: return "57B.A14B";
+        case MODEL_27B: return "27B";
+        default: return "?B";
+    }
+}
+
+static std::string llama_model_ftype_name(llama_ftype ftype) {
     if (ftype & LLAMA_FTYPE_GUESSED) {
         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
     }
@@ -45,6 +112,18 @@ std::string llama_model_ftype_name(llama_ftype ftype) {
     }
 }

+std::string llama_model_arch_name (const llama_model & model) {
+    return llm_arch_name(model.arch);
+}
+
+std::string llama_model_type_name (const llama_model & model) {
+    return llm_type_name(model.type);
+}
+
+std::string llama_model_ftype_name(const llama_model & model) {
+    return llama_model_ftype_name(model.ftype);
+}
+
 template<typename F>
 static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
     ggml_init_params params = {
@@ -83,7 +162,8 @@ static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & b
 }

 ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
-    return select_buft(*model.dev_layer.at(il).buft_list,
+    return select_buft(
+            *model.dev_layer.at(il).buft_list,
             [&](ggml_context * ctx) {
                 ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
                 ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
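
Note: after this file's changes, all human-readable model naming lives behind llama_model-level helpers. A minimal usage sketch, assuming a loaded llama_model instance; the print_model_desc wrapper is illustrative and not part of the commit:

    #include "llama-model.h"
    #include <cstdio>

    // prints e.g. "llama 7B Q4_K - Medium", mirroring llama_model_desc();
    // each helper formats one field of the model (arch, type, ftype)
    static void print_model_desc(const llama_model & model) {
        std::printf("%s %s %s\n",
                llama_model_arch_name (model).c_str(),
                llama_model_type_name (model).c_str(),
                llama_model_ftype_name(model).c_str());
    }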

--- a/src/llama-model.h
+++ b/src/llama-model.h

@@ -15,8 +15,9 @@
 #define LLAMA_MAX_LAYERS 512
 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2

-// available llama models
-enum e_model {
+// available models
+// TODO: this enum does not follow the enum naming convention
+enum llm_type {
     MODEL_UNKNOWN,
     MODEL_14M,
     MODEL_17M,
@@ -81,73 +82,6 @@ enum e_model {
     MODEL_27B,
 };

-static const char * llama_model_type_name(e_model type) {
-    switch (type) {
-        case MODEL_14M: return "14M";
-        case MODEL_17M: return "17M";
-        case MODEL_22M: return "22M";
-        case MODEL_33M: return "33M";
-        case MODEL_60M: return "60M";
-        case MODEL_70M: return "70M";
-        case MODEL_80M: return "80M";
-        case MODEL_109M: return "109M";
-        case MODEL_137M: return "137M";
-        case MODEL_160M: return "160M";
-        case MODEL_220M: return "220M";
-        case MODEL_250M: return "250M";
-        case MODEL_270M: return "270M";
-        case MODEL_335M: return "335M";
-        case MODEL_410M: return "410M";
-        case MODEL_450M: return "450M";
-        case MODEL_770M: return "770M";
-        case MODEL_780M: return "780M";
-        case MODEL_0_5B: return "0.5B";
-        case MODEL_1B: return "1B";
-        case MODEL_1_3B: return "1.3B";
-        case MODEL_1_4B: return "1.4B";
-        case MODEL_1_5B: return "1.5B";
-        case MODEL_1_6B: return "1.6B";
-        case MODEL_2B: return "2B";
-        case MODEL_2_8B: return "2.8B";
-        case MODEL_3B: return "3B";
-        case MODEL_4B: return "4B";
-        case MODEL_6B: return "6B";
-        case MODEL_6_9B: return "6.9B";
-        case MODEL_7B: return "7B";
-        case MODEL_8B: return "8B";
-        case MODEL_9B: return "9B";
-        case MODEL_11B: return "11B";
-        case MODEL_12B: return "12B";
-        case MODEL_13B: return "13B";
-        case MODEL_14B: return "14B";
-        case MODEL_15B: return "15B";
-        case MODEL_16B: return "16B";
-        case MODEL_20B: return "20B";
-        case MODEL_30B: return "30B";
-        case MODEL_32B: return "32B";
-        case MODEL_34B: return "34B";
-        case MODEL_35B: return "35B";
-        case MODEL_40B: return "40B";
-        case MODEL_65B: return "65B";
-        case MODEL_70B: return "70B";
-        case MODEL_236B: return "236B";
-        case MODEL_314B: return "314B";
-        case MODEL_SMALL: return "0.1B";
-        case MODEL_MEDIUM: return "0.4B";
-        case MODEL_LARGE: return "0.8B";
-        case MODEL_XL: return "1.5B";
-        case MODEL_A1_7B: return "A1.7B";
-        case MODEL_A2_7B: return "A2.7B";
-        case MODEL_8x7B: return "8x7B";
-        case MODEL_8x22B: return "8x22B";
-        case MODEL_16x12B: return "16x12B";
-        case MODEL_10B_128x3_66B: return "10B+128x3.66B";
-        case MODEL_57B_A14B: return "57B.A14B";
-        case MODEL_27B: return "27B";
-        default: return "?B";
-    }
-}
-
 struct llama_hparams_posnet {
     uint32_t n_embd;
     uint32_t n_layer;
@@ -187,27 +121,27 @@ struct llama_hparams {
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
     uint32_t n_ff_exp = 0;
     uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
-    float expert_weights_scale = 0.0;
+    uint32_t n_norm_groups = 0;
+
+    float expert_weights_scale = 0.0;

     float f_norm_eps;
     float f_norm_rms_eps;
     float f_norm_group_eps;

-    uint32_t n_norm_groups;
-
     float f_attn_logit_softcapping = 50.0f;
     float f_final_logit_softcapping = 30.0f;

     // for RWKV
     uint32_t rescale_every_n_layers = 0;
     uint32_t time_mix_extra_dim = 0;
     uint32_t time_decay_extra_dim = 0;
     uint32_t wkv_head_size = 0;

     float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
@@ -221,6 +155,7 @@ struct llama_hparams {
     uint32_t ssm_d_inner = 0;
     uint32_t ssm_d_state = 0;
     uint32_t ssm_dt_rank = 0;
+
     bool ssm_dt_b_c_rms = false;

     float f_clamp_kqv = 0.0f;
@@ -518,8 +453,9 @@ struct llama_layer {
 };

 struct llama_model {
-    e_model type = MODEL_UNKNOWN;
+    llm_type type = MODEL_UNKNOWN;
+
     llm_arch arch = LLM_ARCH_UNKNOWN;
     llama_ftype ftype = LLAMA_FTYPE_ALL_F32;

     std::string name = "n/a";
@@ -527,25 +463,25 @@ struct llama_model {
     llama_hparams hparams = {};
     llama_vocab vocab;

     struct ggml_tensor * tok_embd = nullptr;
     struct ggml_tensor * type_embd = nullptr;
     struct ggml_tensor * pos_embd = nullptr;
     struct ggml_tensor * tok_norm = nullptr;
     struct ggml_tensor * tok_norm_b = nullptr;

     struct ggml_tensor * output_norm = nullptr;
     struct ggml_tensor * output_norm_b = nullptr;
     struct ggml_tensor * output = nullptr;
     struct ggml_tensor * output_b = nullptr;
     struct ggml_tensor * output_norm_enc = nullptr;

     // classifier
     struct ggml_tensor * cls = nullptr;
     struct ggml_tensor * cls_b = nullptr;
     struct ggml_tensor * cls_out = nullptr;
     struct ggml_tensor * cls_out_b = nullptr;

     struct ggml_tensor * conv1d = nullptr;
     struct ggml_tensor * conv1d_b = nullptr;

     std::vector<llama_layer> layers;
@@ -611,6 +547,11 @@ struct llama_model {
     }
 };

-ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
-std::string llama_model_ftype_name(llama_ftype ftype);
+const char * llm_type_name(llm_type type);
+
+std::string llama_model_arch_name (const llama_model & model);
+std::string llama_model_type_name (const llama_model & model);
+std::string llama_model_ftype_name(const llama_model & model);
+
+// TODO: this probably belongs to llama-adapter
+ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
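
Note on the new declarations above: llm_type_name() can return const char * because every branch of its switch hands back a string literal with static storage duration, while the three llama_model_* helpers return std::string since llama_model_ftype_name() may append " (guessed)" to the base name. A short caller sketch; the variable names and the example model object are illustrative, not from the commit:

    // no ownership to manage: the pointer refers to a string literal
    const char * type_str = llm_type_name(MODEL_7B); // "7B"

    // built dynamically, hence returned by value as std::string,
    // e.g. "Q4_K - Medium (guessed)" when LLAMA_FTYPE_GUESSED is set
    std::string ftype_str = llama_model_ftype_name(model);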

--- a/src/llama.cpp
+++ b/src/llama.cpp

@@ -1494,6 +1494,8 @@ static void llm_load_hparams(
         hparams.n_embd_head_v = 0;
     }

+    using e_model = llm_type; // TMP
+
     // arch-specific KVs
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
@@ -2999,8 +3001,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
     }

-    LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
-    LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
+    LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model).c_str());
+    LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model).c_str());

     if (ml.n_elements >= 1e12) {
         LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
     } else if (ml.n_elements >= 1e9) {
@@ -10252,9 +10254,9 @@ struct llm_build_context {
                 // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
                 switch (model.type) {
-                    case e_model::MODEL_2B:
-                    case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
-                    case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
+                    case llm_type::MODEL_2B:
+                    case llm_type::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
+                    case llm_type::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
                     default: GGML_ABORT("fatal error");
                 };
                 cb(Qcur, "Qcur_scaled", il);
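
Note: the two branches in this switch apply different query scales, mirroring the referenced gemma_pytorch commit: 2B and 9B use the usual 1/sqrt(n_embd_head_k), while 27B pre-scales by 1/sqrt(n_embd / n_head), which is a different value for that model. A standalone sketch with Gemma-2 27B reference hparams (n_embd = 4608, n_head = 32, n_embd_head_k = 128; these values are not restated in this diff):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float n_embd = 4608.0f, n_head = 32.0f, n_embd_head_k = 128.0f;
        // 2B/9B-style scale: 1/sqrt(head size) = 1/sqrt(128) ~ 0.0884
        std::printf("head-size scale: %f\n", 1.0f / std::sqrt(n_embd_head_k));
        // 27B-style scale: 1/sqrt(n_embd / n_head) = 1/sqrt(144) ~ 0.0833
        std::printf("27B query scale: %f\n", 1.0f / std::sqrt(n_embd / n_head));
        return 0;
    }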
@@ -16505,9 +16507,9 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
 int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
-            llm_arch_name(model->arch), // TODO: llama_model_arch_name(model)
-            llama_model_type_name(model->type), // TODO: llama_model_type_name(model)
-            llama_model_ftype_name(model->ftype).c_str()); // TODO: llama_model_ftype_name(model)
+            llama_model_arch_name (*model).c_str(),
+            llama_model_type_name (*model).c_str(),
+            llama_model_ftype_name(*model).c_str());
 }

 uint64_t llama_model_size(const struct llama_model * model) {