// Mirror of https://github.com/ggerganov/llama.cpp.git
// Synced 2025-01-27 20:43:07 +01:00
// 2176 lines, 100 KiB, C++
#include "llama-model.h"
|
||
|
||
#include "llama-impl.h"
|
||
#include "llama-model-loader.h"
|
||
|
||
#include "unicode.h" // TODO: remove
|
||
|
||
#include <algorithm>
|
||
#include <cassert>
|
||
#include <functional>
|
||
#include <sstream>
|
||
#include <stdexcept>
|
||
|
||
// Binary size units, in bytes; each one is 1024 times the previous.
static constexpr size_t kiB = 1024;
static constexpr size_t MiB = kiB*1024;
static constexpr size_t GiB = MiB*1024;
|
||
|
||
// Maps a model size/type enumerator to its short display label (e.g. "7B").
//
// Returns a pointer to a string literal; any value that is not listed in the
// table below yields "?B".
const char * llm_type_name(llm_type type) {
    struct type_label {
        llm_type     id;
        const char * text;
    };

    static const type_label labels[] = {
        // parameter-count sizes, millions
        { MODEL_14M,  "14M"  },
        { MODEL_17M,  "17M"  },
        { MODEL_22M,  "22M"  },
        { MODEL_33M,  "33M"  },
        { MODEL_60M,  "60M"  },
        { MODEL_70M,  "70M"  },
        { MODEL_80M,  "80M"  },
        { MODEL_109M, "109M" },
        { MODEL_137M, "137M" },
        { MODEL_160M, "160M" },
        { MODEL_220M, "220M" },
        { MODEL_250M, "250M" },
        { MODEL_270M, "270M" },
        { MODEL_335M, "335M" },
        { MODEL_410M, "410M" },
        { MODEL_450M, "450M" },
        { MODEL_770M, "770M" },
        { MODEL_780M, "780M" },
        // parameter-count sizes, billions
        { MODEL_0_5B, "0.5B" },
        { MODEL_1B,   "1B"   },
        { MODEL_1_3B, "1.3B" },
        { MODEL_1_4B, "1.4B" },
        { MODEL_1_5B, "1.5B" },
        { MODEL_1_6B, "1.6B" },
        { MODEL_2B,   "2B"   },
        { MODEL_2_8B, "2.8B" },
        { MODEL_3B,   "3B"   },
        { MODEL_4B,   "4B"   },
        { MODEL_6B,   "6B"   },
        { MODEL_6_9B, "6.9B" },
        { MODEL_7B,   "7B"   },
        { MODEL_8B,   "8B"   },
        { MODEL_9B,   "9B"   },
        { MODEL_11B,  "11B"  },
        { MODEL_12B,  "12B"  },
        { MODEL_13B,  "13B"  },
        { MODEL_14B,  "14B"  },
        { MODEL_15B,  "15B"  },
        { MODEL_16B,  "16B"  },
        { MODEL_20B,  "20B"  },
        { MODEL_30B,  "30B"  },
        { MODEL_32B,  "32B"  },
        { MODEL_34B,  "34B"  },
        { MODEL_35B,  "35B"  },
        { MODEL_40B,  "40B"  },
        { MODEL_65B,  "65B"  },
        { MODEL_70B,  "70B"  },
        { MODEL_236B, "236B" },
        { MODEL_314B, "314B" },
        // named sizes
        { MODEL_SMALL,  "0.1B" },
        { MODEL_MEDIUM, "0.4B" },
        { MODEL_LARGE,  "0.8B" },
        { MODEL_XL,     "1.5B" },
        // composite labels
        { MODEL_A1_7B,           "A1.7B"         },
        { MODEL_A2_7B,           "A2.7B"         },
        { MODEL_8x7B,            "8x7B"          },
        { MODEL_8x22B,           "8x22B"         },
        { MODEL_16x12B,          "16x12B"        },
        { MODEL_10B_128x3_66B,   "10B+128x3.66B" },
        { MODEL_57B_A14B,        "57B.A14B"      },
        { MODEL_27B,             "27B"           },
    };

    for (const type_label & entry : labels) {
        if (entry.id == type) {
            return entry.text;
        }
    }

    // not in the table
    return "?B";
}
|
||
|
||
// Produces a human-readable label for a file type.
//
// The LLAMA_FTYPE_GUESSED bit is stripped before the lookup; when it was set,
// " (guessed)" is appended to the label. Values without a known label map to
// "unknown, may not work".
static std::string llama_model_ftype_name(llama_ftype ftype) {
    const bool        was_guessed = (ftype & LLAMA_FTYPE_GUESSED) != 0;
    const llama_ftype base_ftype  = static_cast<llama_ftype>(ftype & ~LLAMA_FTYPE_GUESSED);

    const char * label = "unknown, may not work";

    switch (base_ftype) {
        case LLAMA_FTYPE_ALL_F32:         label = "all F32";                  break;
        case LLAMA_FTYPE_MOSTLY_F16:      label = "F16";                      break;
        case LLAMA_FTYPE_MOSTLY_BF16:     label = "BF16";                     break;
        case LLAMA_FTYPE_MOSTLY_Q4_0:     label = "Q4_0";                     break;
        case LLAMA_FTYPE_MOSTLY_Q4_1:     label = "Q4_1";                     break;
        case LLAMA_FTYPE_MOSTLY_Q5_0:     label = "Q5_0";                     break;
        case LLAMA_FTYPE_MOSTLY_Q5_1:     label = "Q5_1";                     break;
        case LLAMA_FTYPE_MOSTLY_Q8_0:     label = "Q8_0";                     break;
        case LLAMA_FTYPE_MOSTLY_Q2_K:     label = "Q2_K - Medium";            break;
        case LLAMA_FTYPE_MOSTLY_Q2_K_S:   label = "Q2_K - Small";             break;
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:   label = "Q3_K - Small";             break;
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:   label = "Q3_K - Medium";            break;
        case LLAMA_FTYPE_MOSTLY_Q3_K_L:   label = "Q3_K - Large";             break;
        case LLAMA_FTYPE_MOSTLY_Q4_K_S:   label = "Q4_K - Small";             break;
        case LLAMA_FTYPE_MOSTLY_Q4_K_M:   label = "Q4_K - Medium";            break;
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:   label = "Q5_K - Small";             break;
        case LLAMA_FTYPE_MOSTLY_Q5_K_M:   label = "Q5_K - Medium";            break;
        case LLAMA_FTYPE_MOSTLY_Q6_K:     label = "Q6_K";                     break;
        case LLAMA_FTYPE_MOSTLY_TQ1_0:    label = "TQ1_0 - 1.69 bpw ternary"; break;
        case LLAMA_FTYPE_MOSTLY_TQ2_0:    label = "TQ2_0 - 2.06 bpw ternary"; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:  label = "IQ2_XXS - 2.0625 bpw";     break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS:   label = "IQ2_XS - 2.3125 bpw";      break;
        case LLAMA_FTYPE_MOSTLY_IQ2_S:    label = "IQ2_S - 2.5 bpw";          break;
        case LLAMA_FTYPE_MOSTLY_IQ2_M:    label = "IQ2_M - 2.7 bpw";          break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XS:   label = "IQ3_XS - 3.3 bpw";         break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:  label = "IQ3_XXS - 3.0625 bpw";     break;
        case LLAMA_FTYPE_MOSTLY_IQ1_S:    label = "IQ1_S - 1.5625 bpw";       break;
        case LLAMA_FTYPE_MOSTLY_IQ1_M:    label = "IQ1_M - 1.75 bpw";         break;
        case LLAMA_FTYPE_MOSTLY_IQ4_NL:   label = "IQ4_NL - 4.5 bpw";         break;
        case LLAMA_FTYPE_MOSTLY_IQ4_XS:   label = "IQ4_XS - 4.25 bpw";        break;
        case LLAMA_FTYPE_MOSTLY_IQ3_S:    label = "IQ3_S - 3.4375 bpw";       break;
        case LLAMA_FTYPE_MOSTLY_IQ3_M:    label = "IQ3_S mix - 3.66 bpw";     break;

        default: break; // keep the "unknown" label
    }

    std::string result = label;
    if (was_guessed) {
        result += " (guessed)";
    }
    return result;
}
|
||
|
||
std::string llama_model_arch_name (const llama_model & model) {
|
||
return llm_arch_name(model.arch);
|
||
}
|
||
|
||
std::string llama_model_type_name (const llama_model & model) {
|
||
return llm_type_name(model.type);
|
||
}
|
||
|
||
std::string llama_model_ftype_name(const llama_model & model) {
|
||
return llama_model_ftype_name(model.ftype);
|
||
}
|
||
|
||
template<typename F>
|
||
static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
|
||
ggml_init_params params = {
|
||
/*.mem_size =*/ ggml_tensor_overhead()*8,
|
||
/*.mem_buffer =*/ NULL,
|
||
/*.no_alloc =*/ true,
|
||
};
|
||
|
||
ggml_context_ptr ctx { ggml_init(params) };
|
||
if (!ctx) {
|
||
throw std::runtime_error(format("failed to create ggml context"));
|
||
}
|
||
|
||
ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
|
||
ggml_tensor * op_tensor = fn(ctx.get());
|
||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||
if (op_tensor->src[i] != nullptr) {
|
||
assert(op_tensor->src[i]->buffer == nullptr);
|
||
op_tensor->src[i]->buffer = buf.get();
|
||
}
|
||
}
|
||
|
||
bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
|
||
|
||
return op_supported;
|
||
}
|
||
|
||
template<typename F>
|
||
static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
|
||
for (const auto & cur : buft_list) {
|
||
ggml_backend_dev_t cur_dev = cur.first;
|
||
ggml_backend_buffer_type_t cur_buft = cur.second;
|
||
if (buft_supported(cur_buft, cur_dev, fn)) {
|
||
return cur_buft;
|
||
}
|
||
}
|
||
|
||
throw std::runtime_error(format("no suitable buffer type found"));
|
||
}
|
||
|
||
// Selects a buffer type for layer `il` from that layer's buffer-type list.
//
// The probe op is an F32 add of two 1-D tensors of length hparams.n_embd;
// select_buft() returns the first buffer type whose device supports it and
// throws std::runtime_error if none does. `il` is bounds-checked by
// dev_layer.at().
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
    const auto & layer_bufts = *model.dev_layer.at(il).buft_list;

    const auto build_probe_op = [&model](ggml_context * ctx) {
        ggml_tensor * lhs = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
        ggml_tensor * rhs = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
        return ggml_add(ctx, lhs, rhs);
    };

    return select_buft(layer_bufts, build_probe_op);
}
|
||
|
||
struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name) {
|
||
auto it = std::find_if(model.tensors_by_name.begin(), model.tensors_by_name.end(),
|
||
[name](const std::pair<std::string, struct ggml_tensor *> & it) {
|
||
return it.first == name;
|
||
});
|
||
if (it == model.tensors_by_name.end()) {
|
||
return nullptr;
|
||
}
|
||
|
||
return it->second;
|
||
}
|
||
|
||
size_t llama_model_max_nodes(const llama_model & model) {
|
||
return std::max<size_t>(8192, model.tensors_by_name.size()*5);
|
||
}
|
||
|
||
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
|
||
{ LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
|
||
{ LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
|
||
{ LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
|
||
{ LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
|
||
};
|
||
|
||
// Converts a textual RoPE scaling name into its enumerator by searching
// LLAMA_ROPE_SCALING_TYPES for an exact match.
//
// Returns LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED when the name is not in the table.
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    const auto match = std::find_if(
        LLAMA_ROPE_SCALING_TYPES.begin(), LLAMA_ROPE_SCALING_TYPES.end(),
        [&name](const std::pair<const llama_rope_scaling_type, const char *> & entry) {
            return name == entry.second;
        });

    if (match == LLAMA_ROPE_SCALING_TYPES.end()) {
        return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    }

    return match->first;
}
|
||
|
||
// NOTE: avoid ever using this except for building the token_to_piece caches
|
||
static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
|
||
std::string piece;
|
||
piece.resize(piece.capacity()); // using string internal cache
|
||
const int n_chars = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
|
||
if (n_chars < 0) {
|
||
piece.resize(-n_chars);
|
||
int check = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
|
||
GGML_ASSERT(check == -n_chars);
|
||
}
|
||
else {
|
||
piece.resize(n_chars);
|
||
}
|
||
|
||
return piece;
|
||
}
|
||
|
||
// Copies the loader's element and byte counts (n_elements, n_bytes) into the model.
void llm_load_stats(llama_model_loader & ml, llama_model & model) {
    model.n_bytes    = ml.n_bytes;
    model.n_elements = ml.n_elements;
}
|
||
|
||
// Stores the loader's architecture in model.arch.
//
// Throws std::runtime_error, quoting the loader's architecture name, when the
// architecture is LLM_ARCH_UNKNOWN. model.arch has already been assigned when
// the exception is thrown.
void llm_load_arch(llama_model_loader & ml, llama_model & model) {
    model.arch = ml.get_arch();

    if (model.arch != LLM_ARCH_UNKNOWN) {
        return;
    }

    throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
}
|
||
|
||
void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
|
||
auto & hparams = model.hparams;
|
||
const gguf_context * ctx = ml.meta.get();
|
||
|
||
// get metadata as string
|
||
for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
|
||
enum gguf_type type = gguf_get_kv_type(ctx, i);
|
||
if (type == GGUF_TYPE_ARRAY) {
|
||
continue;
|
||
}
|
||
const char * name = gguf_get_key(ctx, i);
|
||
const std::string value = gguf_kv_to_str(ctx, i);
|
||
model.gguf_kv.emplace(name, value);
|
||
}
|
||
|
||
// get general kv
|
||
ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
|
||
|
||
// get hparams kv
|
||
ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
|
||
|
||
// everything past this point is not vocab-related
|
||
if (hparams.vocab_only) {
|
||
return;
|
||
}
|
||
|
||
ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
|
||
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
|
||
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
|
||
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
|
||
|
||
if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) {
|
||
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
|
||
|
||
ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
|
||
ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
|
||
|
||
ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
|
||
ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
|
||
}
|
||
|
||
GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
|
||
GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
|
||
if (hparams.n_expert > 0) {
|
||
GGML_ASSERT(hparams.n_expert_used > 0);
|
||
} else {
|
||
GGML_ASSERT(hparams.n_expert_used == 0);
|
||
}
|
||
|
||
// zero-out the array hparams
|
||
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
||
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
||
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
||
|
||
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
|
||
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
|
||
|
||
// n_head_kv is optional, default to n_head
|
||
hparams.n_head_kv_arr = hparams.n_head_arr;
|
||
|
||
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
|
||
|
||
bool rope_finetuned = false;
|
||
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
|
||
hparams.rope_finetuned = rope_finetuned;
|
||
|
||
hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
|
||
ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
|
||
|
||
// rope_freq_base (optional)
|
||
hparams.rope_freq_base_train = 10000.0f;
|
||
ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
|
||
|
||
std::string rope_scaling("linear");
|
||
ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
|
||
hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
|
||
GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
|
||
|
||
// rope_freq_scale (inverse of the kv) is optional
|
||
float ropescale = 0.0f;
|
||
if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
|
||
// try the old key name
|
||
ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
|
||
}
|
||
hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
|
||
|
||
ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
|
||
|
||
// non-transformer models do not have attention heads
|
||
if (hparams.n_head() > 0) {
|
||
// gpt-neox n_rot = rotary_pct * (n_embd / n_head)
|
||
// gpt-j n_rot = rotary_dim
|
||
|
||
hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
|
||
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
|
||
|
||
hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
|
||
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
|
||
|
||
// sanity check for n_rot (optional)
|
||
hparams.n_rot = hparams.n_embd_head_k;
|
||
|
||
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
|
||
|
||
if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
|
||
if (hparams.n_rot != hparams.n_embd_head_k) {
|
||
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
|
||
}
|
||
}
|
||
} else {
|
||
hparams.n_rot = 0;
|
||
hparams.n_embd_head_k = 0;
|
||
hparams.n_embd_head_v = 0;
|
||
}
|
||
|
||
using e_model = llm_type; // TMP
|
||
|
||
// arch-specific KVs
|
||
switch (model.arch) {
|
||
case LLM_ARCH_LLAMA:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
|
||
if (hparams.n_expert == 8) {
|
||
switch (hparams.n_layer) {
|
||
case 32: model.type = e_model::MODEL_8x7B; break;
|
||
case 56: model.type = e_model::MODEL_8x22B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} else {
|
||
switch (hparams.n_layer) {
|
||
case 16: model.type = e_model::MODEL_1B; break; // Llama 3.2 1B
|
||
case 22: model.type = e_model::MODEL_1B; break;
|
||
case 26: model.type = e_model::MODEL_3B; break;
|
||
case 28: model.type = e_model::MODEL_3B; break; // Llama 3.2 3B
|
||
// granite uses a vocab with len 49152
|
||
case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
|
||
case 36: model.type = e_model::MODEL_8B; break; // granite
|
||
case 40: model.type = e_model::MODEL_13B; break;
|
||
case 48: model.type = e_model::MODEL_34B; break;
|
||
case 60: model.type = e_model::MODEL_30B; break;
|
||
case 80: model.type = hparams.n_head() == hparams.n_head_kv() ? e_model::MODEL_65B : e_model::MODEL_70B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
}
|
||
} break;
|
||
case LLM_ARCH_DECI:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
switch (hparams.n_layer) {
|
||
case 32: model.type = e_model::MODEL_7B; break;
|
||
case 80: model.type = e_model::MODEL_70B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_MINICPM:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
|
||
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
|
||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 52: model.type = e_model::MODEL_1B; break;
|
||
case 40: model.type = e_model::MODEL_2B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_MINICPM3:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
||
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 62: model.type = e_model::MODEL_4B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_GROK:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 64: model.type = e_model::MODEL_314B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_FALCON:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 32: model.type = e_model::MODEL_7B; break;
|
||
case 60: model.type = e_model::MODEL_40B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_BAICHUAN:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
switch (hparams.n_layer) {
|
||
case 32: model.type = e_model::MODEL_7B; break;
|
||
case 40: model.type = e_model::MODEL_13B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
|
||
if (model.type == e_model::MODEL_13B) {
|
||
// TODO: become GGUF KV parameter
|
||
hparams.f_max_alibi_bias = 8.0f;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_STARCODER:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
switch (hparams.n_layer) {
|
||
case 24: model.type = e_model::MODEL_1B; break;
|
||
case 36: model.type = e_model::MODEL_3B; break;
|
||
case 42: model.type = e_model::MODEL_7B; break;
|
||
case 40: model.type = e_model::MODEL_15B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_REFACT:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
switch (hparams.n_layer) {
|
||
case 32: model.type = e_model::MODEL_1B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
|
||
// TODO: become GGUF KV parameter
|
||
hparams.f_max_alibi_bias = 8.0f;
|
||
} break;
|
||
case LLM_ARCH_BERT:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 3:
|
||
model.type = e_model::MODEL_17M; break; // bge-micro
|
||
case 6:
|
||
model.type = e_model::MODEL_22M; break; // MiniLM-L6
|
||
case 12:
|
||
switch (hparams.n_embd) {
|
||
case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
|
||
case 768: model.type = e_model::MODEL_109M; break; // bge-base
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
case 24:
|
||
model.type = e_model::MODEL_335M; break; // bge-large
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_JINA_BERT_V2:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||
hparams.f_max_alibi_bias = 8.0f;
|
||
|
||
switch (hparams.n_layer) {
|
||
case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
|
||
case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_NOMIC_BERT:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
||
|
||
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
||
model.type = e_model::MODEL_137M;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_BLOOM:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 24: model.type = e_model::MODEL_1B; break;
|
||
case 30:
|
||
switch (hparams.n_embd) {
|
||
case 2560: model.type = e_model::MODEL_3B; break;
|
||
case 4096: model.type = e_model::MODEL_7B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
|
||
// TODO: become GGUF KV parameter
|
||
hparams.f_max_alibi_bias = 8.0f;
|
||
} break;
|
||
case LLM_ARCH_MPT:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
|
||
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 32: model.type = e_model::MODEL_7B; break;
|
||
case 48: model.type = e_model::MODEL_30B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_STABLELM:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 24: model.type = e_model::MODEL_1B; break;
|
||
case 32: model.type = e_model::MODEL_3B; break;
|
||
case 40: model.type = e_model::MODEL_12B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_QWEN:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 32: model.type = e_model::MODEL_7B; break;
|
||
case 40: model.type = e_model::MODEL_13B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_QWEN2VL:
|
||
{
|
||
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
|
||
}
|
||
// fall through
|
||
case LLM_ARCH_QWEN2:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
switch (hparams.n_layer) {
|
||
case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
|
||
case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
|
||
case 32: model.type = e_model::MODEL_7B; break;
|
||
case 36: model.type = e_model::MODEL_3B; break;
|
||
case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
|
||
case 48: model.type = e_model::MODEL_14B; break;
|
||
case 64: model.type = e_model::MODEL_32B; break;
|
||
case 80: model.type = e_model::MODEL_70B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_QWEN2MOE:
|
||
{
|
||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
||
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
||
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
switch (hparams.n_layer) {
|
||
case 24: model.type = e_model::MODEL_A2_7B; break;
|
||
case 28: model.type = e_model::MODEL_57B_A14B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_PHI2:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 24: model.type = e_model::MODEL_1B; break;
|
||
case 32: model.type = e_model::MODEL_3B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_PHI3:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 24: model.type = e_model::MODEL_1B; break;
|
||
case 32: model.type = e_model::MODEL_3B; break;
|
||
case 40: model.type = e_model::MODEL_14B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
|
||
// for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
|
||
if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
|
||
// default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
|
||
hparams.n_swa = 2047;
|
||
} else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
|
||
// default value for Phi-3-mini-128k-instruct
|
||
hparams.n_swa = 262144;
|
||
} else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
|
||
// default value for Phi-3-medium-128k-instruct
|
||
hparams.n_swa = 131072;
|
||
}
|
||
bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||
if (!found_swa && hparams.n_swa == 0) {
|
||
throw std::runtime_error("invalid value for sliding_window");
|
||
}
|
||
} break;
|
||
case LLM_ARCH_PLAMO:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 40: model.type = e_model::MODEL_13B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_GPT2:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
switch (hparams.n_layer) {
|
||
case 12: model.type = e_model::MODEL_SMALL; break;
|
||
case 24: model.type = e_model::MODEL_MEDIUM; break;
|
||
case 36: model.type = e_model::MODEL_LARGE; break;
|
||
case 48: model.type = e_model::MODEL_XL; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_CODESHELL:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
switch (hparams.n_layer) {
|
||
case 42: model.type = e_model::MODEL_7B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_ORION:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 40: model.type = e_model::MODEL_14B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_INTERNLM2:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
switch (hparams.n_layer) {
|
||
case 32: model.type = e_model::MODEL_7B; break;
|
||
case 48: model.type = e_model::MODEL_20B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_GEMMA:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 18: model.type = e_model::MODEL_2B; break;
|
||
case 28: model.type = e_model::MODEL_7B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_GEMMA2:
|
||
{
|
||
hparams.n_swa = 4096; // default value of gemma 2
|
||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
|
||
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
|
||
hparams.attn_soft_cap = true;
|
||
|
||
switch (hparams.n_layer) {
|
||
case 26: model.type = e_model::MODEL_2B; break;
|
||
case 42: model.type = e_model::MODEL_9B; break;
|
||
case 46: model.type = e_model::MODEL_27B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_STARCODER2:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
switch (hparams.n_layer) {
|
||
case 30: model.type = e_model::MODEL_3B; break;
|
||
case 32: model.type = e_model::MODEL_7B; break;
|
||
case 40: model.type = e_model::MODEL_15B; break;
|
||
case 52: model.type = e_model::MODEL_20B; break; // granite
|
||
case 88: model.type = e_model::MODEL_34B; break; // granite
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_MAMBA:
|
||
{
|
||
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
||
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
|
||
ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
||
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
||
ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
|
||
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 24:
|
||
switch (hparams.n_embd) {
|
||
case 768: model.type = e_model::MODEL_SMALL; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
case 48:
|
||
switch (hparams.n_embd) {
|
||
case 1024: model.type = e_model::MODEL_MEDIUM; break;
|
||
case 1536: model.type = e_model::MODEL_LARGE; break;
|
||
case 2048: model.type = e_model::MODEL_XL; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
case 64:
|
||
switch (hparams.n_embd) {
|
||
case 2560: model.type = e_model::MODEL_3B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_XVERSE:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
switch (hparams.n_layer) {
|
||
case 32: model.type = e_model::MODEL_7B; break;
|
||
case 40: model.type = e_model::MODEL_13B; break;
|
||
case 80: model.type = e_model::MODEL_65B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_COMMAND_R:
|
||
{
|
||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
switch (hparams.n_layer) {
|
||
case 40: model.type = e_model::MODEL_35B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_COHERE2:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
switch (hparams.n_layer) {
|
||
case 32: model.type = e_model::MODEL_8B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_DBRX:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 40: model.type = e_model::MODEL_16x12B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_OLMO:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 22: model.type = e_model::MODEL_1B; break;
|
||
case 32: model.type = e_model::MODEL_7B; break;
|
||
case 80: model.type = e_model::MODEL_70B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_OLMO2:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 16: model.type = e_model::MODEL_1B; break;
|
||
case 32: model.type = e_model::MODEL_7B; break;
|
||
case 40: model.type = e_model::MODEL_13B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_OLMOE:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
switch (hparams.n_layer) {
|
||
case 16: model.type = e_model::MODEL_A1_7B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_OPENELM:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 16: model.type = e_model::MODEL_270M; break;
|
||
case 20: model.type = e_model::MODEL_450M; break;
|
||
case 28: model.type = e_model::MODEL_1B; break;
|
||
case 36: model.type = e_model::MODEL_3B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_GPTNEOX:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
|
||
switch (hparams.n_layer) {
|
||
case 6:
|
||
switch (hparams.n_ff()) {
|
||
case 512: model.type = e_model::MODEL_14M; break;
|
||
case 2048: model.type = e_model::MODEL_70M; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
case 12:
|
||
switch (hparams.n_ff()) {
|
||
case 3072: model.type = e_model::MODEL_160M; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
case 16:
|
||
switch (hparams.n_ff()) {
|
||
case 8192: model.type = e_model::MODEL_1B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
case 24:
|
||
switch (hparams.n_ff()) {
|
||
case 4096: model.type = e_model::MODEL_410M; break;
|
||
case 8192: model.type = e_model::MODEL_1_4B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
case 32:
|
||
switch (hparams.n_ff()) {
|
||
case 10240: model.type = e_model::MODEL_2_8B; break;
|
||
case 16384: model.type = e_model::MODEL_6_9B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
case 36:
|
||
switch (hparams.n_ff()) {
|
||
case 20480: model.type = e_model::MODEL_12B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
case 44:
|
||
switch (hparams.n_ff()) {
|
||
case 24576: model.type = e_model::MODEL_20B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_ARCTIC:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
|
||
if (hparams.n_expert == 128) {
|
||
switch (hparams.n_layer) {
|
||
case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} else {
|
||
model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_DEEPSEEK:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 28: model.type = e_model::MODEL_20B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_DEEPSEEK2:
|
||
{
|
||
bool is_lite = (hparams.n_layer == 27);
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||
if (!is_lite) {
|
||
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
||
}
|
||
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 27: model.type = e_model::MODEL_16B; break;
|
||
case 60: model.type = e_model::MODEL_236B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_CHATGLM:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
switch (hparams.n_layer) {
|
||
case 28: model.type = e_model::MODEL_6B; break;
|
||
case 40: model.type = e_model::MODEL_9B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_BITNET:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 26: model.type = e_model::MODEL_3B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_T5:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
|
||
|
||
uint32_t dec_start_token_id;
|
||
if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
|
||
hparams.dec_start_token_id = dec_start_token_id;
|
||
}
|
||
|
||
switch (hparams.n_layer) {
|
||
case 6: model.type = e_model::MODEL_60M; break; // t5-small
|
||
case 8: model.type = e_model::MODEL_80M; break; // flan-t5-small
|
||
case 12:
|
||
switch (hparams.n_ff()) {
|
||
case 3072: model.type = e_model::MODEL_220M; break; // t5-base
|
||
case 2048: model.type = e_model::MODEL_250M; break; // flan-t5-base
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
case 24:
|
||
switch (hparams.n_ff()) {
|
||
case 4096: model.type = e_model::MODEL_770M; break; // t5-large
|
||
case 2816: model.type = e_model::MODEL_780M; break; // flan-t5-large
|
||
case 16384: model.type = e_model::MODEL_3B; break; // t5-3b
|
||
case 5120: model.type = e_model::MODEL_3B; break; // flan-t5-xl
|
||
case 65536: model.type = e_model::MODEL_11B; break; // t5-11b
|
||
case 10240: model.type = e_model::MODEL_11B; break; // flan-t5-xxl
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_T5ENCODER:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
|
||
model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
case LLM_ARCH_JAIS:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 24: model.type = e_model::MODEL_1_3B; break;
|
||
case 40: model.type = e_model::MODEL_13B; break;
|
||
/* TODO: add variants */
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_NEMOTRON:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
switch (hparams.n_layer) {
|
||
case 32: model.type = e_model::MODEL_4B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_EXAONE:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 32: model.type = e_model::MODEL_8B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_RWKV6:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
|
||
ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
|
||
ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
|
||
ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 24: model.type = e_model::MODEL_1_6B; break;
|
||
case 32:
|
||
switch (hparams.n_embd) {
|
||
case 2560: model.type = e_model::MODEL_3B; break;
|
||
case 4096: model.type = e_model::MODEL_7B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
} break;
|
||
case 61: model.type = e_model::MODEL_14B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_GRANITE:
|
||
case LLM_ARCH_GRANITE_MOE:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
|
||
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
|
||
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 32: model.type = e_model::MODEL_3B; break;
|
||
case 40: model.type = e_model::MODEL_3B; break;
|
||
// Add additional layer/vocab/etc checks here for other model sizes
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_CHAMELEON:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||
hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
|
||
ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
|
||
|
||
switch (hparams.n_layer) {
|
||
case 32: model.type = e_model::MODEL_7B; break;
|
||
case 48: model.type = e_model::MODEL_34B; break;
|
||
default: model.type = e_model::MODEL_UNKNOWN;
|
||
}
|
||
} break;
|
||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||
{
|
||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
|
||
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
|
||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||
} break;
|
||
default: throw std::runtime_error("unsupported model architecture");
|
||
}
|
||
|
||
model.ftype = ml.ftype;
|
||
|
||
if (hparams.f_max_alibi_bias > 0.0f) {
|
||
hparams.use_alibi = true;
|
||
}
|
||
|
||
hparams.rope_type = llama_rope_type(&model);
|
||
}
|
||
|
||
void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
|
||
auto & vocab = model.vocab;
|
||
|
||
struct gguf_context * ctx = ml.meta.get();
|
||
|
||
const auto kv = LLM_KV(model.arch);
|
||
|
||
// determine vocab type
|
||
{
|
||
std::string tokenizer_model;
|
||
std::string tokenizer_pre;
|
||
|
||
ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
|
||
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
||
|
||
if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
|
||
vocab.type = LLAMA_VOCAB_TYPE_NONE;
|
||
|
||
// default special tokens
|
||
vocab.special_bos_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_eos_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_unk_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_sep_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_pad_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_cls_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_mask_id = LLAMA_TOKEN_NULL;
|
||
vocab.linefeed_id = LLAMA_TOKEN_NULL;
|
||
|
||
// read vocab size from metadata
|
||
if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
|
||
vocab.n_vocab = 0;
|
||
LLAMA_LOG_WARN("%s: there is no vocab_size in metadata, vocab.n_vocab will be set to %u\n", __func__, vocab.n_vocab);
|
||
}
|
||
return;
|
||
}
|
||
|
||
if (tokenizer_model == "llama") {
|
||
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
||
|
||
// default special tokens
|
||
vocab.special_bos_id = 1;
|
||
vocab.special_eos_id = 2;
|
||
vocab.special_unk_id = 0;
|
||
vocab.special_sep_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_pad_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_cls_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_mask_id = LLAMA_TOKEN_NULL;
|
||
} else if (tokenizer_model == "bert") {
|
||
vocab.type = LLAMA_VOCAB_TYPE_WPM;
|
||
|
||
// default special tokens
|
||
vocab.special_bos_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_eos_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_unk_id = 100;
|
||
vocab.special_sep_id = 102;
|
||
vocab.special_pad_id = 0;
|
||
vocab.special_cls_id = 101;
|
||
vocab.special_mask_id = 103;
|
||
} else if (tokenizer_model == "gpt2") {
|
||
vocab.type = LLAMA_VOCAB_TYPE_BPE;
|
||
|
||
// read bpe merges and populate bpe ranks
|
||
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
|
||
if (merges_keyidx == -1) {
|
||
throw std::runtime_error("cannot find tokenizer merges in model file\n");
|
||
}
|
||
|
||
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
|
||
for (int i = 0; i < n_merges; i++) {
|
||
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
|
||
GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
|
||
|
||
std::string first;
|
||
std::string second;
|
||
|
||
const size_t pos = word.find(' ', 1);
|
||
|
||
if (pos != std::string::npos) {
|
||
first = word.substr(0, pos);
|
||
second = word.substr(pos + 1);
|
||
}
|
||
|
||
vocab.bpe_ranks.emplace(std::make_pair(first, second), i);
|
||
}
|
||
|
||
// default special tokens
|
||
vocab.special_bos_id = 11;
|
||
vocab.special_eos_id = 11;
|
||
vocab.special_unk_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_sep_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_pad_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_cls_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_mask_id = LLAMA_TOKEN_NULL;
|
||
} else if (tokenizer_model == "t5") {
|
||
vocab.type = LLAMA_VOCAB_TYPE_UGM;
|
||
|
||
// default special tokens
|
||
vocab.special_bos_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_eos_id = 1;
|
||
vocab.special_unk_id = 2;
|
||
vocab.special_sep_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_pad_id = 0;
|
||
vocab.special_cls_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_mask_id = LLAMA_TOKEN_NULL;
|
||
|
||
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
|
||
if (precompiled_charsmap_keyidx != -1) {
|
||
size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
|
||
const char * precompiled_charsmap = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
|
||
vocab.precompiled_charsmap.assign(precompiled_charsmap, precompiled_charsmap + n_precompiled_charsmap);
|
||
#ifdef IS_BIG_ENDIAN
|
||
// correct endianness of data in precompiled_charsmap binary blob
|
||
uint32_t * xcda_blob_size = (uint32_t *) &vocab.precompiled_charsmap[0];
|
||
*xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
|
||
assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
|
||
size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
|
||
uint32_t * xcda_array = (uint32_t *) &vocab.precompiled_charsmap[sizeof(uint32_t)];
|
||
for (size_t i = 0; i < xcda_array_size; ++i) {
|
||
xcda_array[i] = __builtin_bswap32(xcda_array[i]);
|
||
}
|
||
#endif
|
||
}
|
||
} else if (tokenizer_model == "rwkv") {
|
||
vocab.type = LLAMA_VOCAB_TYPE_RWKV;
|
||
|
||
// default special tokens
|
||
vocab.special_bos_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_eos_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_unk_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_sep_id = LLAMA_TOKEN_NULL;
|
||
vocab.special_pad_id = LLAMA_TOKEN_NULL;
|
||
} else {
|
||
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
|
||
}
|
||
|
||
// for now, only BPE models have pre-tokenizers
|
||
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
||
vocab.tokenizer_add_space_prefix = false;
|
||
vocab.tokenizer_clean_spaces = true;
|
||
if (tokenizer_pre.empty()) {
|
||
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
|
||
LLAMA_LOG_WARN("%s: \n", __func__);
|
||
LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
|
||
LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
|
||
LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
|
||
LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
|
||
LLAMA_LOG_WARN("%s: \n", __func__);
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||
} else if (tokenizer_pre == "default") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||
} else if (
|
||
tokenizer_pre == "llama3" ||
|
||
tokenizer_pre == "llama-v3" ||
|
||
tokenizer_pre == "llama-bpe"||
|
||
tokenizer_pre == "falcon3") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
|
||
vocab.tokenizer_ignore_merges = true;
|
||
vocab.tokenizer_add_bos = true;
|
||
} else if (
|
||
tokenizer_pre == "deepseek-llm") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
|
||
vocab.tokenizer_clean_spaces = false;
|
||
} else if (
|
||
tokenizer_pre == "deepseek-coder") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
|
||
vocab.tokenizer_clean_spaces = false;
|
||
} else if (
|
||
tokenizer_pre == "falcon") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
|
||
} else if (
|
||
tokenizer_pre == "mpt") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
|
||
} else if (
|
||
tokenizer_pre == "starcoder") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
|
||
} else if (
|
||
tokenizer_pre == "gpt-2" ||
|
||
tokenizer_pre == "phi-2" ||
|
||
tokenizer_pre == "jina-es" ||
|
||
tokenizer_pre == "jina-de" ||
|
||
tokenizer_pre == "gigachat" ||
|
||
tokenizer_pre == "jina-v1-en" ||
|
||
tokenizer_pre == "jina-v2-es" ||
|
||
tokenizer_pre == "jina-v2-de" ||
|
||
tokenizer_pre == "jina-v2-code" ||
|
||
tokenizer_pre == "roberta-bpe") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
||
} else if (
|
||
tokenizer_pre == "refact") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
|
||
} else if (
|
||
tokenizer_pre == "command-r") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
|
||
vocab.tokenizer_clean_spaces = false;
|
||
} else if (
|
||
tokenizer_pre == "qwen2") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
||
vocab.tokenizer_clean_spaces = false;
|
||
} else if (
|
||
tokenizer_pre == "stablelm2") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
|
||
} else if (
|
||
tokenizer_pre == "olmo") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
|
||
} else if (
|
||
tokenizer_pre == "dbrx") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
|
||
} else if (
|
||
tokenizer_pre == "smaug-bpe") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
|
||
} else if (
|
||
tokenizer_pre == "poro-chat") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
|
||
vocab.tokenizer_clean_spaces = false;
|
||
} else if (
|
||
tokenizer_pre == "chatglm-bpe") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
|
||
vocab.special_bos_id = LLAMA_TOKEN_NULL;
|
||
} else if (
|
||
tokenizer_pre == "viking") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
|
||
vocab.tokenizer_clean_spaces = false;
|
||
} else if (
|
||
tokenizer_pre == "jais") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
|
||
} else if (
|
||
tokenizer_pre == "tekken") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
|
||
vocab.tokenizer_clean_spaces = false;
|
||
vocab.tokenizer_ignore_merges = true;
|
||
vocab.tokenizer_add_bos = true;
|
||
} else if (
|
||
tokenizer_pre == "smollm") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
|
||
vocab.tokenizer_clean_spaces = false;
|
||
} else if (
|
||
tokenizer_pre == "codeshell") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
|
||
} else if (
|
||
tokenizer_pre == "bloom") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
|
||
} else if (
|
||
tokenizer_pre == "gpt3-finnish") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
|
||
} else if (
|
||
tokenizer_pre == "exaone") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
|
||
} else if (
|
||
tokenizer_pre == "chameleon") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
|
||
vocab.tokenizer_add_bos = true;
|
||
vocab.tokenizer_clean_spaces = false;
|
||
} else if (
|
||
tokenizer_pre == "minerva-7b") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
|
||
} else if (
|
||
tokenizer_pre == "megrez") {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
||
} else {
|
||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||
}
|
||
} else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||
vocab.tokenizer_add_space_prefix = true;
|
||
vocab.tokenizer_clean_spaces = false;
|
||
vocab.tokenizer_add_bos = true;
|
||
vocab.tokenizer_add_eos = false;
|
||
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||
vocab.tokenizer_add_space_prefix = false;
|
||
vocab.tokenizer_clean_spaces = true;
|
||
vocab.tokenizer_add_bos = true;
|
||
vocab.tokenizer_add_eos = false;
|
||
} else if (vocab.type == LLAMA_VOCAB_TYPE_UGM) {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||
vocab.tokenizer_add_bos = false;
|
||
vocab.tokenizer_add_eos = true;
|
||
} else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||
vocab.tokenizer_add_space_prefix = false;
|
||
vocab.tokenizer_clean_spaces = false;
|
||
vocab.tokenizer_add_bos = false;
|
||
vocab.tokenizer_add_eos = false;
|
||
} else {
|
||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||
}
|
||
|
||
ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
|
||
ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
|
||
}
|
||
|
||
const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
|
||
if (token_idx == -1) {
|
||
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
|
||
}
|
||
|
||
const float * scores = nullptr;
|
||
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
|
||
if (score_idx != -1) {
|
||
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
|
||
}
|
||
|
||
const int * toktypes = nullptr;
|
||
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
|
||
if (toktype_idx != -1) {
|
||
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
||
}
|
||
|
||
const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
|
||
|
||
vocab.n_vocab = n_vocab;
|
||
vocab.id_to_token.resize(n_vocab);
|
||
|
||
for (uint32_t i = 0; i < n_vocab; i++) {
|
||
std::string word = gguf_get_arr_str(ctx, token_idx, i);
|
||
if (word.empty()) {
|
||
LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
|
||
word = "[EMPTY_" + std::to_string(i) + "]";
|
||
}
|
||
|
||
vocab.token_to_id[word] = i;
|
||
vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
|
||
|
||
auto & token_data = vocab.id_to_token[i];
|
||
token_data.text = std::move(word);
|
||
token_data.score = scores ? scores[i] : 0.0f;
|
||
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
|
||
|
||
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
|
||
switch(toktypes[i]) {
|
||
case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
|
||
case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
|
||
case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
|
||
case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
|
||
case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
|
||
case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
|
||
case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
||
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
||
}
|
||
}
|
||
}
|
||
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
|
||
|
||
vocab.init_tokenizer();
|
||
|
||
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
||
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
||
try {
|
||
vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
|
||
} catch (const std::exception & e) {
|
||
LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
|
||
vocab.linefeed_id = vocab.special_pad_id;
|
||
}
|
||
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
|
||
vocab.linefeed_id = vocab.special_pad_id;
|
||
} else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
|
||
const std::vector<int> ids = llama_tokenize_internal(vocab, "\n", false);
|
||
GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
|
||
vocab.linefeed_id = ids[0];
|
||
} else {
|
||
const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
|
||
|
||
//GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
|
||
if (ids.empty()) {
|
||
LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
|
||
vocab.linefeed_id = vocab.special_pad_id;
|
||
} else {
|
||
vocab.linefeed_id = ids[0];
|
||
}
|
||
}
|
||
|
||
// special tokens
|
||
{
|
||
const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
|
||
{ LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
|
||
{ LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
|
||
{ LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
|
||
{ LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
|
||
{ LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
|
||
{ LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
|
||
{ LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
|
||
{ LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
|
||
{ LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
|
||
{ LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
|
||
{ LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
|
||
{ LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
|
||
{ LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
|
||
{ LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
|
||
{ LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
|
||
|
||
// deprecated
|
||
{ LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
|
||
{ LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
|
||
{ LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
|
||
};
|
||
|
||
for (const auto & it : special_token_types) {
|
||
const std::string & key = kv(std::get<0>(it));
|
||
int32_t & id = std::get<1>(it);
|
||
|
||
uint32_t new_id;
|
||
if (!ml.get_key(std::get<0>(it), new_id, false)) {
|
||
continue;
|
||
}
|
||
if (new_id >= vocab.id_to_token.size()) {
|
||
LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
|
||
__func__, key.c_str(), new_id, id);
|
||
} else {
|
||
id = new_id;
|
||
}
|
||
}
|
||
|
||
// Handle add_bos_token and add_eos_token
|
||
{
|
||
bool temp = true;
|
||
|
||
if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
|
||
vocab.tokenizer_add_bos = temp;
|
||
}
|
||
if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
|
||
vocab.tokenizer_add_eos = temp;
|
||
}
|
||
}
|
||
|
||
// auto-detect special tokens by text
|
||
// TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
|
||
// for now, we apply this workaround to find the tokens based on their text
|
||
|
||
for (const auto & t : vocab.token_to_id) {
|
||
// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
|
||
if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
|
||
if (false
|
||
|| t.first == "<|eot_id|>"
|
||
|| t.first == "<|im_end|>"
|
||
|| t.first == "<|end|>"
|
||
|| t.first == "<end_of_turn>"
|
||
|| t.first == "<|endoftext|>"
|
||
|| t.first == "<EOT>"
|
||
|| t.first == "<|end▁of▁sentence|>" // DeepSeek
|
||
) {
|
||
vocab.special_eot_id = t.second;
|
||
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||
__func__, t.second, t.first.c_str());
|
||
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||
}
|
||
}
|
||
}
|
||
|
||
// find EOM token: "<|eom_id|>"
|
||
if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
|
||
if (false
|
||
|| t.first == "<|eom_id|>"
|
||
) {
|
||
vocab.special_eom_id = t.second;
|
||
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||
__func__, t.second, t.first.c_str());
|
||
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||
}
|
||
}
|
||
}
|
||
|
||
// find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
|
||
if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
|
||
if (false
|
||
|| t.first == "<|fim_prefix|>" // Qwen
|
||
|| t.first == "<fim-prefix>"
|
||
|| t.first == "<|fim▁begin|>" // DeepSeek
|
||
|| t.first == "<PRE>"
|
||
) {
|
||
vocab.special_fim_pre_id = t.second;
|
||
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||
__func__, t.second, t.first.c_str());
|
||
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||
}
|
||
}
|
||
}
|
||
|
||
// find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
|
||
if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
|
||
if (false
|
||
|| t.first == "<|fim_suffix|>" // Qwen
|
||
|| t.first == "<fim-suffix>"
|
||
|| t.first == "<|fim▁hole|>" // DeepSeek
|
||
|| t.first == "<SUF>"
|
||
) {
|
||
vocab.special_fim_suf_id = t.second;
|
||
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||
__func__, t.second, t.first.c_str());
|
||
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||
}
|
||
}
|
||
}
|
||
|
||
// find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
|
||
if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
|
||
if (false
|
||
|| t.first == "<|fim_middle|>" // Qwen
|
||
|| t.first == "<fim-middle>"
|
||
|| t.first == "<|fim▁end|>" // DeepSeek
|
||
|| t.first == "<MID>"
|
||
) {
|
||
vocab.special_fim_mid_id = t.second;
|
||
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||
__func__, t.second, t.first.c_str());
|
||
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||
}
|
||
}
|
||
}
|
||
|
||
// find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
|
||
if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
|
||
if (false
|
||
|| t.first == "<|fim_pad|>" // Qwen
|
||
|| t.first == "<fim-pad>"
|
||
|| t.first == "<PAD>"
|
||
) {
|
||
vocab.special_fim_pad_id = t.second;
|
||
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||
__func__, t.second, t.first.c_str());
|
||
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||
}
|
||
}
|
||
}
|
||
|
||
// find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
|
||
if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
|
||
if (false
|
||
|| t.first == "<|fim_repo|>" // Qwen
|
||
|| t.first == "<|repo_name|>"
|
||
|| t.first == "<fim-repo>"
|
||
|| t.first == "<REPO>"
|
||
) {
|
||
vocab.special_fim_rep_id = t.second;
|
||
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||
__func__, t.second, t.first.c_str());
|
||
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||
}
|
||
}
|
||
}
|
||
|
||
// find FIM_SEP token: "<|file_sep|>"
|
||
if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
|
||
if (false
|
||
|| t.first == "<|file_sep|>" // Qwen
|
||
) {
|
||
vocab.special_fim_sep_id = t.second;
|
||
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||
__func__, t.second, t.first.c_str());
|
||
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// maintain a list of tokens that cause end-of-generation
|
||
// this is currently determined based on the token text, which is obviously not ideal
|
||
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
|
||
vocab.special_eog_ids.clear();
|
||
|
||
if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
|
||
vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
|
||
}
|
||
|
||
if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
|
||
vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
|
||
}
|
||
|
||
if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
|
||
vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
|
||
}
|
||
|
||
for (const auto & t : vocab.token_to_id) {
|
||
if (false
|
||
|| t.first == "<|eot_id|>"
|
||
|| t.first == "<|im_end|>"
|
||
|| t.first == "<|end|>"
|
||
|| t.first == "<end_of_turn>"
|
||
|| t.first == "<|endoftext|>"
|
||
|| t.first == "<|eom_id|>"
|
||
|| t.first == "<EOT>"
|
||
) {
|
||
vocab.special_eog_ids.insert(t.second);
|
||
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||
__func__, t.second, t.first.c_str());
|
||
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||
}
|
||
} else {
|
||
// token is control, but not marked as EOG -> print a debug log
|
||
if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
|
||
LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
|
||
__func__, t.second, t.first.c_str());
|
||
}
|
||
}
|
||
}
|
||
|
||
// sanity checks
|
||
if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
|
||
vocab.special_eog_ids.insert(vocab.special_eos_id);
|
||
LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
|
||
}
|
||
|
||
if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
|
||
vocab.special_eog_ids.insert(vocab.special_eot_id);
|
||
LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
|
||
}
|
||
|
||
if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
|
||
vocab.special_eog_ids.insert(vocab.special_eom_id);
|
||
LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
|
||
}
|
||
}
|
||
|
||
// build special tokens cache
|
||
{
|
||
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
|
||
if (vocab.id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
|
||
vocab.cache_special_tokens.push_back(id);
|
||
}
|
||
}
|
||
|
||
std::sort(vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
|
||
[&] (const llama_vocab::id a, const llama_vocab::id b) {
|
||
return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
|
||
}
|
||
);
|
||
|
||
LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
|
||
}
|
||
|
||
// build token to piece cache
|
||
{
|
||
size_t size_cache = 0;
|
||
|
||
std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
|
||
|
||
for (uint32_t id = 0; id < n_vocab; ++id) {
|
||
cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
|
||
|
||
size_cache += cache_token_to_piece[id].size();
|
||
}
|
||
|
||
std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
|
||
|
||
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
|
||
}
|
||
|
||
// Handle per token attributes
|
||
//NOTE: Each model customizes per token attributes.
|
||
//NOTE: Per token attributes are missing from the GGUF file.
|
||
//TODO: Extract attributes from GGUF file.
|
||
{
|
||
auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
|
||
for (auto substr : substrs) {
|
||
if (str.find(substr) < std::string::npos) {
|
||
return true;
|
||
}
|
||
}
|
||
return false;
|
||
};
|
||
|
||
auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
|
||
uint32_t current = vocab.id_to_token.at(id).attr;
|
||
current = value ? (current | attr) : (current & ~attr);
|
||
vocab.id_to_token[id].attr = (llama_token_attr) current;
|
||
};
|
||
|
||
auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
|
||
_set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
|
||
};
|
||
|
||
std::string model_name;
|
||
std::string tokenizer_pre;
|
||
|
||
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
|
||
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
||
|
||
// model name to lowercase
|
||
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
|
||
[] (const std::string::value_type x) {
|
||
return std::tolower(x);
|
||
}
|
||
);
|
||
|
||
// set attributes by model/tokenizer name
|
||
if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
|
||
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
|
||
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
|
||
for (auto id : vocab.cache_special_tokens) {
|
||
_set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
||
}
|
||
for (auto token : {"</s>"}) {
|
||
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
||
}
|
||
for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
|
||
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Log a human-readable summary of a loaded model at INFO level.
//
//   ml    - loader; supplies the file version (fver), element count (n_elements)
//           and byte count (n_bytes)
//   model - model whose hparams, vocab, arch and name are reported
//
// Sections, in order: file format / arch / vocab basics, the full hparams dump
// (skipped when hparams.vocab_only is set), model type / ftype / params / size,
// general.name, special and EOG tokens, and a few architecture-specific values.
void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    const auto & hparams = model.hparams;
    const auto & vocab   = model.vocab;

    const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);

    // Format f(0) .. f(n-1): a single number when every entry is equal,
    // otherwise the full list as "[a, b, ...]".
    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
        // nothing to evaluate (all callers pass n_layer, which may be 0):
        // return early instead of indexing an empty vector below
        if (n == 0) {
            return std::string("[]");
        }

        bool is_var = false;

        std::vector<uint32_t> v;
        v.reserve(n);
        for (uint32_t i = 0; i < n; ++i) {
            v.push_back(f(i));
            if (v[i] != v[0]) {
                is_var = true;
            }
        }

        std::stringstream ss;

        if (is_var) {
            ss << "[";
            for (uint32_t i = 0; i < n; ++i) {
                ss << v[i];
                if (i < n - 1) {
                    ss << ", ";
                }
            }
            ss << "]";
        } else {
            ss << v[0];
        }

        return ss.str();
    };

    // hparams
    LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, llm_arch_name(model.arch));
    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
    LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
    // cast matches the %u conversion (the size is logged as an unsigned value)
    LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) vocab.bpe_ranks.size());
    LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);

    if (!hparams.vocab_only) {
        LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
        LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
        LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
        LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
        LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
        LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
        LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
        LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
        LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
        LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
        LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
        LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
        LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
        LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
        LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
        LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
        LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
        LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
        LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
        LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
        LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
        LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
        LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
        LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
    }

    LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model).c_str());
    LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model).c_str());

    // parameter count, scaled to the largest fitting unit (T / B / M / K)
    if (ml.n_elements >= 1e12) {
        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
    } else if (ml.n_elements >= 1e9) {
        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
    } else if (ml.n_elements >= 1e6) {
        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
    } else {
        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
    }

    // size in MiB below 1 GiB, in GiB otherwise; BPW = bits per element (n_bytes*8 / n_elements)
    if (ml.n_bytes < GiB) {
        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
    } else {
        LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
    }

    // general kv
    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

    // special tokens (an id of -1 means the token is not set and is skipped)
    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
    if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
    if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
    if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
    if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }

    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }

    if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
    if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
    if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
    if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
    if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
    if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }

    for (const auto & id : vocab.special_eog_ids) {
        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
    }

    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);

    // architecture-specific hparams
    if (model.arch == LLM_ARCH_DEEPSEEK) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
    }

    if (model.arch == LLM_ARCH_DEEPSEEK2) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
        LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
    }

    if (model.arch == LLM_ARCH_QWEN2MOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
    }

    if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
    }
}
|
||
|
||
//
|
||
// interface implementation
|
||
//
|
||
|
||
struct llama_model_params llama_model_default_params() {
|
||
struct llama_model_params result = {
|
||
/*.devices =*/ nullptr,
|
||
/*.n_gpu_layers =*/ 0,
|
||
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
||
/*.main_gpu =*/ 0,
|
||
/*.tensor_split =*/ nullptr,
|
||
/*.rpc_servers =*/ nullptr,
|
||
/*.progress_callback =*/ nullptr,
|
||
/*.progress_callback_user_data =*/ nullptr,
|
||
/*.kv_overrides =*/ nullptr,
|
||
/*.vocab_only =*/ false,
|
||
/*.use_mmap =*/ true,
|
||
/*.use_mlock =*/ false,
|
||
/*.check_tensors =*/ false,
|
||
};
|
||
|
||
#ifdef GGML_USE_METAL
|
||
// note: we usually have plenty of VRAM, so by default offload all layers to the GPU
|
||
result.n_gpu_layers = 999;
|
||
#endif
|
||
|
||
return result;
|
||
}
|
||
|
||
// Destroy a model object with `delete`.
// Passing a null pointer is harmless: deleting nullptr does nothing.
void llama_free_model(struct llama_model * model) {
    delete model;
}
|
||
|
||
enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
|
||
return model->vocab.type;
|
||
}
|
||
|
||
int32_t llama_n_vocab(const struct llama_model * model) {
|
||
return model->hparams.n_vocab;
|
||
}
|
||
|
||
int32_t llama_n_ctx_train(const struct llama_model * model) {
|
||
return model->hparams.n_ctx_train;
|
||
}
|
||
|
||
int32_t llama_n_embd(const struct llama_model * model) {
|
||
return model->hparams.n_embd;
|
||
}
|
||
|
||
int32_t llama_n_layer(const struct llama_model * model) {
|
||
return model->hparams.n_layer;
|
||
}
|
||
|
||
int32_t llama_n_head(const struct llama_model * model) {
|
||
return model->hparams.n_head();
|
||
}
|
||
|
||
enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||
switch (model->arch) {
|
||
// these models do not use RoPE
|
||
case LLM_ARCH_GPT2:
|
||
case LLM_ARCH_GPTJ:
|
||
case LLM_ARCH_MPT:
|
||
case LLM_ARCH_REFACT:
|
||
case LLM_ARCH_BLOOM:
|
||
case LLM_ARCH_MAMBA:
|
||
case LLM_ARCH_JINA_BERT_V2:
|
||
case LLM_ARCH_T5:
|
||
case LLM_ARCH_T5ENCODER:
|
||
case LLM_ARCH_JAIS:
|
||
case LLM_ARCH_RWKV6:
|
||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||
return LLAMA_ROPE_TYPE_NONE;
|
||
|
||
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
||
case LLM_ARCH_LLAMA:
|
||
case LLM_ARCH_DECI:
|
||
case LLM_ARCH_BAICHUAN:
|
||
case LLM_ARCH_STARCODER:
|
||
case LLM_ARCH_PLAMO:
|
||
case LLM_ARCH_ORION:
|
||
case LLM_ARCH_INTERNLM2:
|
||
case LLM_ARCH_MINICPM:
|
||
case LLM_ARCH_XVERSE:
|
||
case LLM_ARCH_COMMAND_R:
|
||
case LLM_ARCH_COHERE2:
|
||
case LLM_ARCH_OLMO:
|
||
case LLM_ARCH_ARCTIC:
|
||
case LLM_ARCH_DEEPSEEK:
|
||
case LLM_ARCH_DEEPSEEK2:
|
||
case LLM_ARCH_CHATGLM:
|
||
case LLM_ARCH_GRANITE:
|
||
case LLM_ARCH_GRANITE_MOE:
|
||
case LLM_ARCH_CHAMELEON:
|
||
return LLAMA_ROPE_TYPE_NORM;
|
||
|
||
// the pairs of head values are offset by n_rot/2
|
||
case LLM_ARCH_FALCON:
|
||
case LLM_ARCH_GROK:
|
||
case LLM_ARCH_DBRX:
|
||
case LLM_ARCH_BERT:
|
||
case LLM_ARCH_NOMIC_BERT:
|
||
case LLM_ARCH_STABLELM:
|
||
case LLM_ARCH_BITNET:
|
||
case LLM_ARCH_QWEN:
|
||
case LLM_ARCH_QWEN2:
|
||
case LLM_ARCH_QWEN2MOE:
|
||
case LLM_ARCH_OLMO2:
|
||
case LLM_ARCH_OLMOE:
|
||
case LLM_ARCH_PHI2:
|
||
case LLM_ARCH_PHI3:
|
||
case LLM_ARCH_GEMMA:
|
||
case LLM_ARCH_GEMMA2:
|
||
case LLM_ARCH_STARCODER2:
|
||
case LLM_ARCH_OPENELM:
|
||
case LLM_ARCH_GPTNEOX:
|
||
case LLM_ARCH_CODESHELL:
|
||
case LLM_ARCH_NEMOTRON:
|
||
case LLM_ARCH_EXAONE:
|
||
case LLM_ARCH_MINICPM3:
|
||
return LLAMA_ROPE_TYPE_NEOX;
|
||
|
||
case LLM_ARCH_QWEN2VL:
|
||
return LLAMA_ROPE_TYPE_MROPE;
|
||
|
||
// all model arches should be listed explicitly here
|
||
case LLM_ARCH_UNKNOWN:
|
||
GGML_ABORT("unknown architecture");
|
||
}
|
||
|
||
return LLAMA_ROPE_TYPE_NONE;
|
||
}
|
||
|
||
float llama_rope_freq_scale_train(const struct llama_model * model) {
|
||
return model->hparams.rope_freq_scale_train;
|
||
}
|
||
|
||
int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
|
||
const auto & it = model->gguf_kv.find(key);
|
||
if (it == model->gguf_kv.end()) {
|
||
if (buf_size > 0) {
|
||
buf[0] = '\0';
|
||
}
|
||
return -1;
|
||
}
|
||
return snprintf(buf, buf_size, "%s", it->second.c_str());
|
||
}
|
||
|
||
int32_t llama_model_meta_count(const struct llama_model * model) {
|
||
return (int)model->gguf_kv.size();
|
||
}
|
||
|
||
int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
|
||
if (i < 0 || i >= (int)model->gguf_kv.size()) {
|
||
if (buf_size > 0) {
|
||
buf[0] = '\0';
|
||
}
|
||
return -1;
|
||
}
|
||
auto it = model->gguf_kv.begin();
|
||
std::advance(it, i);
|
||
return snprintf(buf, buf_size, "%s", it->first.c_str());
|
||
}
|
||
|
||
int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
|
||
if (i < 0 || i >= (int)model->gguf_kv.size()) {
|
||
if (buf_size > 0) {
|
||
buf[0] = '\0';
|
||
}
|
||
return -1;
|
||
}
|
||
auto it = model->gguf_kv.begin();
|
||
std::advance(it, i);
|
||
return snprintf(buf, buf_size, "%s", it->second.c_str());
|
||
}
|
||
|
||
int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
|
||
return snprintf(buf, buf_size, "%s %s %s",
|
||
llama_model_arch_name (*model).c_str(),
|
||
llama_model_type_name (*model).c_str(),
|
||
llama_model_ftype_name(*model).c_str());
|
||
}
|
||
|
||
uint64_t llama_model_size(const struct llama_model * model) {
|
||
return model->n_bytes;
|
||
}
|
||
|
||
uint64_t llama_model_n_params(const struct llama_model * model) {
|
||
return model->n_elements;
|
||
}
|
||
|
||
bool llama_model_has_encoder(const struct llama_model * model) {
|
||
switch (model->arch) {
|
||
case LLM_ARCH_T5: return true;
|
||
case LLM_ARCH_T5ENCODER: return true;
|
||
default: return false;
|
||
}
|
||
}
|
||
|
||
bool llama_model_has_decoder(const struct llama_model * model) {
|
||
switch (model->arch) {
|
||
case LLM_ARCH_T5ENCODER: return false;
|
||
default: return true;
|
||
}
|
||
}
|
||
|
||
// Accessor: hparams.dec_start_token_id.
llama_token llama_model_decoder_start_token(const struct llama_model * model) {
    const auto & hp = model->hparams;
    return hp.dec_start_token_id;
}
|
||
|
||
bool llama_model_is_recurrent(const struct llama_model * model) {
|
||
switch (model->arch) {
|
||
case LLM_ARCH_MAMBA: return true;
|
||
case LLM_ARCH_RWKV6: return true;
|
||
default: return false;
|
||
}
|
||
}
|