From ca0908559308553472f56ae58d7783ef0442cae6 Mon Sep 17 00:00:00 2001
From: Eddie-Wang
Date: Sun, 9 Jun 2024 02:43:38 +0000
Subject: [PATCH] move i2s to quantize v1

---
 convert-hf-to-gguf.py          | 19 ++++++++++++-------
 examples/quantize/quantize.cpp |  1 +
 ggml-quants.c                  | 27 +++++++++++++++++++++++++++
 ggml-quants.h                  |  1 +
 ggml.c                         |  6 ++++--
 llama.cpp                      |  6 +-----
 6 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index ea993d720..735630b9c 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1418,6 +1418,10 @@ class BitnetModel(Model):
         dtype = weight.dtype
         weight = weight.float()
         s = 1 / weight.abs().mean().clamp(min=1e-5)
+        # from gguf.lazy import LazyNumpyTensor
+        # np_s = LazyNumpyTensor.to_eager(s.numpy())
+
+        # print(np_s)
         result = (weight * s).round().clamp(-1, 1) / s
         return result.type(dtype)
 
@@ -1444,14 +1448,15 @@ class BitnetModel(Model):
         scale = np.tile(scale, 8)
         return ans, scale
 
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # quant weight to i2 (in fp16)
-        if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
-                          "down_proj.weight", "up_proj.weight", "gate_proj.weight",
-                          "o_proj.weight")):
-            data_torch = data_torch + (self.weight_quant(data_torch) - data_torch).detach()
+    # def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+    #     # quant weight to i2 (in fp16)
+    #     if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
+    #                       "down_proj.weight", "up_proj.weight", "gate_proj.weight",
+    #                       "o_proj.weight")):
+    #         print(name)
+    #         data_torch = data_torch + (self.weight_quant(data_torch) - data_torch).detach()
 
-        return [(self.map_tensor_name(name), data_torch)]
+    #     return [(self.map_tensor_name(name), data_torch)]
 
     def write_tensors(self):
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 28584e14b..bc2cc2435 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -26,6 +26,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ2_M",   LLAMA_FTYPE_MOSTLY_IQ2_M,   " 2.7  bpw quantization",            },
     { "IQ1_S",   LLAMA_FTYPE_MOSTLY_IQ1_S,   " 1.56 bpw quantization",            },
     { "IQ1_M",   LLAMA_FTYPE_MOSTLY_IQ1_M,   " 1.75 bpw quantization",            },
+    { "I2_S",    LLAMA_FTYPE_MOSTLY_I2,      " 2 bpw per-tensor quantization",    },
     { "Q2_K",    LLAMA_FTYPE_MOSTLY_Q2_K,    " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S",  LLAMA_FTYPE_MOSTLY_Q2_K_S,  " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization",            },
diff --git a/ggml-quants.c b/ggml-quants.c
index 1353671cc..96d3c88f6 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -3306,6 +3306,33 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr
     return nrow * row_size;
 }
 
+size_t quantize_i2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    // 2 bits per weight
+    size_t row_size = ggml_row_size(GGML_TYPE_I2, n_per_row) / 4;
+    char * qrow = (char *)dst;
+    printf("n_row:%lld\n", (long long)nrow);
+    printf("n_per_row:%lld\n", (long long)n_per_row);
+    int64_t n = nrow * n_per_row;
+    float accu = 0.0;
+    float min = 0.00001;
+    for (int64_t i = 0; i < n; ++i) {
+        accu += fabs(src[i]);
+    }
+    accu = accu > min ? accu : min;
+    float scale = n / accu;
+
+    printf("\nscale:%f\n", scale);
+
+    // for (int64_t row = 0; row < nrow; ++row) {
+    //     quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
+    //     src += n_per_row;
+    //     qrow += row_size;
+    // }
+
+    // 32B for scale
+    return nrow * row_size + 32;
+}
+
 // ====================== "True" 2-bit (de)-quantization
 
 void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
diff --git a/ggml-quants.h b/ggml-quants.h
index 1c8e3839d..fea0b41ad 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -122,6 +122,7 @@ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
 size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_i2_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 
 void iq2xs_init_impl(enum ggml_type type);
 void iq2xs_free_impl(enum ggml_type type);
diff --git a/ggml.c b/ggml.c
index 06aa601b2..378042537 100644
--- a/ggml.c
+++ b/ggml.c
@@ -573,7 +573,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_name                = "i2",
         .blck_size                = 1,
         .type_size                = sizeof(int8_t),
-        .is_quantized             = false,
+        .is_quantized             = true,
         .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_i2_q8_0,
         .vec_dot_type             = GGML_TYPE_Q8_0,
         .nrows                    = 1,
@@ -2637,6 +2637,7 @@ inline static void ggml_vec_absmaxclamp_f32(const int n, float * s, float * x, f
     }
     *s = max;
 }
+
 inline static void ggml_vec_scaleroundclamp_f32(const int n, float * s, const float * x, float scale, float min, float max) {
     for (int i = 0; i < n; ++i) {
         s[i] = round(x[i] * scale);
@@ -2645,6 +2646,7 @@ inline static void ggml_vec_scaleroundclamp_f32(const int n, float * s, const fl
         s[i] /= scale;
     }
 }
+
 inline static void ggml_vec_scaleroundclamp_f32_v2(const int n, float * s, int8_t* inp, float scale, float min, float max) {
     float temp;
     for (int i = 0; i < n; ++i) {
@@ -2653,7 +2655,6 @@ inline static void ggml_vec_scaleroundclamp_f32_v2(const int n, float * s, int8_
         if (temp < min) temp = min;
         inp[i] = (int8_t)(temp);
     }
-
 }
 
 //
@@ -21726,6 +21727,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_I2:      result = quantize_i2_s   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
diff --git a/llama.cpp b/llama.cpp
index 4db25c45e..109ac4034 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -15634,6 +15634,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
         case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
         case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;
+        case LLAMA_FTYPE_MOSTLY_I2:   default_type = GGML_TYPE_I2;   break;
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
@@ -15658,7 +15659,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S:  default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M:  default_type = GGML_TYPE_IQ3_S;  break;
-        case LLAMA_FTYPE_MOSTLY_I2 : default_type = GGML_TYPE_I2; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -15896,10 +15896,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
             new_type = params->output_tensor_type;
         }
-        if (tensor->type == 31) {
-            // no need quantize for i2
-            new_type = tensor->type;
-        }
         // If we've decided to quantize to the same type the tensor is already
        // in then there's nothing to do.
         quantize = tensor->type != new_type;
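
Note: quantize_i2_s() above only computes the per-tensor scale so far; the actual packing loop is still commented out. For reference, below is a minimal, self-contained sketch of what the 2 bpw ternary packing could look like, using the same scale = n / sum(|w|) as weight_quant() in convert-hf-to-gguf.py. The function name, the {-1, 0, +1} -> {0, 1, 2} code mapping, and the trailing 32-byte scale block are assumptions for illustration, not the final ggml layout.

// quantize_i2_sketch.c -- hypothetical illustration, not part of the patch
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Quantize n float weights to ternary {-1, 0, +1} with one per-tensor scale,
// packing four 2-bit codes per byte and appending the scale in a trailing
// 32-byte block (mirroring the "32B for scale" comment in the patch).
// Returns the number of bytes written.
static size_t quantize_i2_sketch(const float * src, uint8_t * dst, int64_t n) {
    // scale = 1 / mean(|w|), clamped away from zero as in weight_quant()
    double accu = 0.0;
    for (int64_t i = 0; i < n; ++i) {
        accu += fabs(src[i]);
    }
    const double min   = 1e-5;
    const float  scale = (float)((double)n / (accu > min ? accu : min));

    const size_t packed = (size_t)((n + 3) / 4);
    memset(dst, 0, packed + 32);

    for (int64_t i = 0; i < n; ++i) {
        // round to the nearest of {-1, 0, +1} after scaling
        int q = (int)roundf(src[i] * scale);
        if (q < -1) q = -1;
        if (q >  1) q =  1;
        // store as an unsigned 2-bit code: -1 -> 0, 0 -> 1, +1 -> 2
        dst[i / 4] |= (uint8_t)(q + 1) << (2 * (i % 4));
    }

    // trailing 32-byte block: first 4 bytes hold the fp32 scale (assumed layout)
    memcpy(dst + packed, &scale, sizeof(scale));
    return packed + 32;
}

int main(void) {
    const float w[8] = { 0.9f, -1.1f, 0.02f, 0.5f, -0.4f, 1.3f, -0.03f, 0.0f };
    uint8_t buf[2 + 32];
    size_t written = quantize_i2_sketch(w, buf, 8);
    printf("wrote %zu bytes, first byte = 0x%02x\n", written, buf[0]);
    return 0;
}

With this coding, dequantizing a 2-bit code c back to a weight would simply be (c - 1) / scale, matching the fake-quantization done by weight_quant() before this patch moved the work into the quantize tool.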