mirror of https://github.com/ggerganov/llama.cpp.git
llama : fix Jamba quantization sanity checks
parent fc59407efe
commit 181dadf294

llama.cpp (17 lines changed)
@@ -16290,11 +16290,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks
-    //
-    //  - qs.n_attention_wv == 0 for Mamba models
-    //  - qs.n_attention_wv == model.hparams.n_layer for Transformer models
-    //
-    GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
+    {
+        const auto & n_head_kv_vec = model.hparams.n_head_kv_vec;
+        int n_attn_layer;
+        if (model.hparams.n_head_kv == 0) {
+            // Mamba models don't have attention layers
+            n_attn_layer = 0;
+        } else {
+            // Transformers and hybrid models (like Jamba) have attention layers
+            n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_vec.begin(), n_head_kv_vec.end(), 0);
+        }
+        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+    }
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;
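For illustration only (not part of the commit), below is a minimal standalone C++ sketch of the counting logic behind the new sanity check: in a per-layer KV-head vector, a 0 entry marks a recurrent (Mamba) layer, so the expected number of attention layers is the total layer count minus the number of zero entries. The layer layouts and the helper name expected_attn_layers are hypothetical, not taken from llama.cpp.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical per-layer KV-head counts: 0 marks a recurrent (Mamba) layer,
// a non-zero value marks an attention layer, as in a Jamba-like hybrid.
static int expected_attn_layers(const std::vector<uint32_t> & n_head_kv_vec) {
    const int n_layer = (int) n_head_kv_vec.size();
    // same idea as the patched check: subtract the layers with 0 KV heads
    return n_layer - (int) std::count(n_head_kv_vec.begin(), n_head_kv_vec.end(), 0u);
}

int main() {
    const std::vector<uint32_t> pure_mamba(8, 0);                   // no attention layers at all
    const std::vector<uint32_t> transformer(8, 8);                  // every layer has attention
    const std::vector<uint32_t> hybrid = {0, 0, 0, 8, 0, 0, 0, 8};  // Jamba-like: 2 of 8 layers

    printf("mamba: %d, transformer: %d, hybrid: %d\n",
           expected_attn_layers(pure_mamba),
           expected_attn_layers(transformer),
           expected_attn_layers(hybrid));
    return 0;
}

Compiled as C++11 or later, this prints "mamba: 0, transformer: 8, hybrid: 2", which matches what the patched assertion would expect for each model family, whereas the old check only accepted 0 or n_layer and so rejected hybrid models like Jamba.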