diff --git a/src/llama.cpp b/src/llama.cpp
index de0caf755..7cdd27d86 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17195,10 +17195,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     // sanity checks
     //
-    //  - qs.n_attention_wv == 0                     for Mamba           models
-    //  - qs.n_attention_wv == model.hparams.n_layer for Transformer     models
+    //  - qs.n_attention_wv == 0                         for Mamba           models
+    //  - qs.n_attention_wv == model.hparams.n_layer     for Transformer     models
+    //  - qs.n_attention_wv == 3 * model.hparams.n_layer for Encoder-Decoder models
     //
-    GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
+    GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;
@@ -17323,6 +17324,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= name.find("ssm_x.weight") == std::string::npos;
         quantize &= name.find("ssm_dt.weight") == std::string::npos;
 
+        // do not quantize relative position bias (T5)
+        quantize &= name.find("attn_rel_b.weight") == std::string::npos;
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
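
Note on the 3 * n_layer bound in the assert above: a minimal illustrative sketch, not part of the patch. The assumption is that a T5-style encoder-decoder has three attention blocks per layer (encoder self-attention, decoder self-attention, decoder cross-attention), each with its own V projection, so the per-tensor counter reaches three per layer; the helper name below is hypothetical and exists only for this example.

#include <cassert>

// Hypothetical helper mirroring the sanity-check arithmetic (sketch only).
static int expected_n_attention_wv(int n_layer, bool is_mamba, bool is_encoder_decoder) {
    if (is_mamba)           return 0;            // Mamba layers carry no attention V tensors
    if (is_encoder_decoder) return 3 * n_layer;  // enc self-attn + dec self-attn + dec cross-attn
    return n_layer;                              // decoder-only Transformer: one V tensor per layer
}

int main() {
    // e.g. a 12-layer T5: 12 encoder V + 12 decoder V + 12 cross-attention V = 36
    assert(expected_n_attention_wv(12, false, true) == 3 * 12);
    return 0;
}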