llama : quantization-related fixes for T5

2025-01-13 13:52:22 +01:00 · 2024-06-29 18:09:22 +02:00 · 2024-06-29 18:09:22 +02:00 · 6dc9eb4040
commit 6dc9eb4040
parent 7d7fff4654
1 changed file with 7 additions and 3 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17195,10 +17195,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

    // sanity checks
    //
-    //  - qs.n_attention_wv == 0                     for Mamba       models
-    //  - qs.n_attention_wv == model.hparams.n_layer for Transformer models
+    //  - qs.n_attention_wv == 0                         for Mamba           models
+    //  - qs.n_attention_wv == model.hparams.n_layer     for Transformer     models
+    //  - qs.n_attention_wv == 3 * model.hparams.n_layer for Encoder-Decoder models
    //
-    GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
+    GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer) && "n_attention_wv is unexpected");

    size_t total_size_org = 0;
    size_t total_size_new = 0;
@@ -17323,6 +17324,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        quantize &= name.find("ssm_x.weight")      == std::string::npos;
        quantize &= name.find("ssm_dt.weight")     == std::string::npos;

+        // do not quantize relative position bias (T5)
+        quantize &= name.find("attn_rel_b.weight") == std::string::npos;
+
        enum ggml_type new_type;
        void * new_data;
        size_t new_size;