mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-01 08:49:00 +01:00
llama : fix Jamba quantization sanity checks
This commit is contained in:
parent
fc59407efe
commit
181dadf294
17
llama.cpp
17
llama.cpp
@ -16290,11 +16290,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
|
||||
|
||||
// sanity checks
|
||||
//
|
||||
// - qs.n_attention_wv == 0 for Mamba models
|
||||
// - qs.n_attention_wv == model.hparams.n_layer for Transformer models
|
||||
//
|
||||
GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
|
||||
{
|
||||
const auto & n_head_kv_vec = model.hparams.n_head_kv_vec;
|
||||
int n_attn_layer;
|
||||
if (model.hparams.n_head_kv == 0) {
|
||||
// Mamba models don't have attention layers
|
||||
n_attn_layer = 0;
|
||||
} else {
|
||||
// Transformers and hybrid models (like Jamba) have attention layers
|
||||
n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_vec.begin(), n_head_kv_vec.end(), 0);
|
||||
}
|
||||
GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
|
||||
}
|
||||
|
||||
size_t total_size_org = 0;
|
||||
size_t total_size_new = 0;
|
||||
|
Loading…
Reference in New Issue
Block a user