From ca0908559308553472f56ae58d7783ef0442cae6 Mon Sep 17 00:00:00 2001
From: Eddie-Wang
Date: Sun, 9 Jun 2024 02:43:38 +0000
Subject: [PATCH] move i2s to quantize v1

---
 convert-hf-to-gguf.py          | 19 ++++++++++++-------
 examples/quantize/quantize.cpp |  1 +
 ggml-quants.c                  | 27 +++++++++++++++++++++++++++
 ggml-quants.h                  |  1 +
 ggml.c                         |  6 ++++--
 llama.cpp                      |  6 +-----
 6 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index ea993d720..735630b9c 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1418,6 +1418,10 @@ class BitnetModel(Model):
         dtype = weight.dtype
         weight = weight.float()
         s = 1 / weight.abs().mean().clamp(min=1e-5)
+        # from gguf.lazy import LazyNumpyTensor
+        # np_s = LazyNumpyTensor.to_eager(s.numpy())
+
+        # print(np_s)
         result = (weight * s).round().clamp(-1, 1) / s
         return result.type(dtype)
 
@@ -1444,14 +1448,15 @@ class BitnetModel(Model):
         scale = np.tile(scale, 8)
         return ans, scale
 
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # quant weight to i2 (in fp16)
-        if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
-                          "down_proj.weight", "up_proj.weight", "gate_proj.weight",
-                          "o_proj.weight")):
-            data_torch = data_torch + (self.weight_quant(data_torch) - data_torch).detach()
+    # def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+    #     # quant weight to i2 (in fp16)
+    #     if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
+    #                       "down_proj.weight", "up_proj.weight", "gate_proj.weight",
+    #                       "o_proj.weight")):
+    #         print(name)
+    #         data_torch = data_torch + (self.weight_quant(data_torch) - data_torch).detach()
 
-        return [(self.map_tensor_name(name), data_torch)]
+    #     return [(self.map_tensor_name(name), data_torch)]
 
     def write_tensors(self):
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 28584e14b..bc2cc2435 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -26,6 +26,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ2_M",   LLAMA_FTYPE_MOSTLY_IQ2_M,   " 2.7  bpw quantization",            },
     { "IQ1_S",   LLAMA_FTYPE_MOSTLY_IQ1_S,   " 1.56 bpw quantization",            },
     { "IQ1_M",   LLAMA_FTYPE_MOSTLY_IQ1_M,   " 1.75 bpw quantization",            },
+    { "I2_S",    LLAMA_FTYPE_MOSTLY_I2,      " 2 bpw per-tensor quantization",    },
     { "Q2_K",    LLAMA_FTYPE_MOSTLY_Q2_K,    " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S",  LLAMA_FTYPE_MOSTLY_Q2_K_S,  " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization",            },
diff --git a/ggml-quants.c b/ggml-quants.c
index 1353671cc..96d3c88f6 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -3306,6 +3306,33 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr
     return nrow * row_size;
 }
 
+size_t quantize_i2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    // 2 bits per weight
+    size_t row_size = ggml_row_size(GGML_TYPE_I2, n_per_row) / 4;
+    char * qrow = (char *)dst;
+    printf("n_row:%lld\n", (long long)nrow);
+    printf("n_per_row:%lld\n", (long long)n_per_row);
+    int64_t n = nrow * n_per_row;
+    float accu = 0.0;
+    float min = 0.00001;
+    for (int64_t i = 0; i < n; ++i) {
+        accu += fabs(src[i]);
+    }
+    accu = accu > min ? accu : min;
+    float scale = n / accu;
+
+    printf("\nscale:%f\n", scale);
+
+    // for (int64_t row = 0; row < nrow; ++row) {
+    //     quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
+    //     src += n_per_row;
+    //     qrow += row_size;
+    // }
+
+    // 32B for scale
+    return nrow * row_size + 32;
+}
+
 // ====================== "True" 2-bit (de)-quantization
 
 void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
diff --git a/ggml-quants.h b/ggml-quants.h
index 1c8e3839d..fea0b41ad 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -122,6 +122,7 @@ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
 size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_i2_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 
 void iq2xs_init_impl(enum ggml_type type);
 void iq2xs_free_impl(enum ggml_type type);
diff --git a/ggml.c b/ggml.c
index 06aa601b2..378042537 100644
--- a/ggml.c
+++ b/ggml.c
@@ -573,7 +573,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_name                = "i2",
         .blck_size                = 1,
         .type_size                = sizeof(int8_t),
-        .is_quantized             = false,
+        .is_quantized             = true,
         .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_i2_q8_0,
         .vec_dot_type             = GGML_TYPE_Q8_0,
         .nrows                    = 1,
@@ -2637,6 +2637,7 @@ inline static void ggml_vec_absmaxclamp_f32(const int n, float * s, float * x, f
     }
     *s = max;
 }
+
 inline static void ggml_vec_scaleroundclamp_f32(const int n, float * s, const float * x, float scale, float min, float max) {
     for (int i = 0; i < n; ++i) {
         s[i] = round(x[i] * scale);
@@ -2645,6 +2646,7 @@ inline static void ggml_vec_scaleroundclamp_f32(const int n, float * s, const fl
         s[i] /= scale;
     }
 }
+
 inline static void ggml_vec_scaleroundclamp_f32_v2(const int n, float * s, int8_t* inp, float scale, float min, float max) {
     float temp;
     for (int i = 0; i < n; ++i) {
@@ -2653,7 +2655,6 @@ inline static void ggml_vec_scaleroundclamp_f32_v2(const int n, float * s, int8_
         if (temp < min) temp = min;
         inp[i] = (int8_t)(temp);
     }
-
 }
 
 //
@@ -21726,6 +21727,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_I2:      result = quantize_i2_s   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
diff --git a/llama.cpp b/llama.cpp
index 4db25c45e..109ac4034 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -15634,6 +15634,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
         case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
         case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;
+        case LLAMA_FTYPE_MOSTLY_I2:   default_type = GGML_TYPE_I2;   break;
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
@@ -15658,7 +15659,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S:  default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M:  default_type = GGML_TYPE_IQ3_S;  break;
-        case LLAMA_FTYPE_MOSTLY_I2 : default_type = GGML_TYPE_I2; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -15896,10 +15896,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
             new_type = params->output_tensor_type;
         }
-        if (tensor->type == 31) {
-            // no need quantize for i2
-            new_type = tensor->type;
-        }
         // If we've decided to quantize to the same type the tensor is already
        // in then there's nothing to do.
         quantize = tensor->type != new_type;
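
Note: quantize_i2_s() above only computes the per-tensor scale so far; the actual packing loop is still commented out. For reference, below is a minimal, self-contained sketch of what the 2 bpw ternary packing could look like, using the same scale = n / sum(|w|) as weight_quant() in convert-hf-to-gguf.py. The function name, the {-1, 0, +1} -> {0, 1, 2} code mapping, and the trailing 32-byte scale block are assumptions for illustration, not the final ggml layout.

// quantize_i2_sketch.c -- hypothetical illustration, not part of the patch
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Quantize n float weights to ternary {-1, 0, +1} with one per-tensor scale,
// packing four 2-bit codes per byte and appending the scale in a trailing
// 32-byte block (mirroring the "32B for scale" comment in the patch).
// Returns the number of bytes written.
static size_t quantize_i2_sketch(const float * src, uint8_t * dst, int64_t n) {
    // scale = 1 / mean(|w|), clamped away from zero as in weight_quant()
    double accu = 0.0;
    for (int64_t i = 0; i < n; ++i) {
        accu += fabs(src[i]);
    }
    const double min   = 1e-5;
    const float  scale = (float)((double)n / (accu > min ? accu : min));

    const size_t packed = (size_t)((n + 3) / 4);
    memset(dst, 0, packed + 32);

    for (int64_t i = 0; i < n; ++i) {
        // round to the nearest of {-1, 0, +1} after scaling
        int q = (int)roundf(src[i] * scale);
        if (q < -1) q = -1;
        if (q >  1) q =  1;
        // store as an unsigned 2-bit code: -1 -> 0, 0 -> 1, +1 -> 2
        dst[i / 4] |= (uint8_t)(q + 1) << (2 * (i % 4));
    }

    // trailing 32-byte block: first 4 bytes hold the fp32 scale (assumed layout)
    memcpy(dst + packed, &scale, sizeof(scale));
    return packed + 32;
}

int main(void) {
    const float w[8] = { 0.9f, -1.1f, 0.02f, 0.5f, -0.4f, 1.3f, -0.03f, 0.0f };
    uint8_t buf[2 + 32];
    size_t written = quantize_i2_sketch(w, buf, 8);
    printf("wrote %zu bytes, first byte = 0x%02x\n", written, buf[0]);
    return 0;
}

With this coding, dequantizing a 2-bit code c back to a weight would simply be (c - 1) / scale, matching the fake-quantization done by weight_quant() before this patch moved the work into the quantize tool.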