Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-13 05:42:22 +01:00)

Commit ca09085593 (parent 4e1ab50628): move i2s to quantize v1
@@ -1418,6 +1418,10 @@ class BitnetModel(Model):
         dtype = weight.dtype
         weight = weight.float()
         s = 1 / weight.abs().mean().clamp(min=1e-5)
+        # from gguf.lazy import LazyNumpyTensor
+        # np_s = LazyNumpyTensor.to_eager(s.numpy())
+
+        # print(np_s)
         result = (weight * s).round().clamp(-1, 1) / s
         return result.type(dtype)
 
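Note: weight_quant above is BitNet's per-tensor absmean rounding, and it is the reference point for the I2 work in the rest of this commit. As an illustration only (this sketch is not part of the commit), the same round-trip on a plain float array looks like this in C:

#include <math.h>
#include <stddef.h>

// Fake-quantize w[0..n) in place: s = 1 / mean(|w|), round to {-1, 0, +1}
// in the scaled domain, then divide back by s (mirrors weight_quant above).
static void absmean_fake_quant(float * w, size_t n) {
    float mean_abs = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        mean_abs += fabsf(w[i]);
    }
    mean_abs /= (float) n;
    if (mean_abs < 1e-5f) {
        mean_abs = 1e-5f;               // clamp(min=1e-5)
    }
    const float s = 1.0f / mean_abs;
    for (size_t i = 0; i < n; ++i) {
        float q = roundf(w[i] * s);     // round to nearest integer
        if (q >  1.0f) q =  1.0f;       // clamp(-1, 1)
        if (q < -1.0f) q = -1.0f;
        w[i] = q / s;                   // back to the original scale
    }
}

Dividing back by s keeps the tensor at its original scale, so the convert step only simulates the ternary grid; the actual 2-bit packing is deferred to quantize time, which is what this commit moves toward.
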
@@ -1444,14 +1448,15 @@ class BitnetModel(Model):
         scale = np.tile(scale, 8)
         return ans, scale
 
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # quant weight to i2 (in fp16)
-        if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
-                          "down_proj.weight", "up_proj.weight", "gate_proj.weight",
-                          "o_proj.weight")):
-            data_torch = data_torch + (self.weight_quant(data_torch) - data_torch).detach()
+    # def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+    #     # quant weight to i2 (in fp16)
+    #     if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
+    #                       "down_proj.weight", "up_proj.weight", "gate_proj.weight",
+    #                       "o_proj.weight")):
+    #         print(name)
+    #         data_torch = data_torch + (self.weight_quant(data_torch) - data_torch).detach()
 
-        return [(self.map_tensor_name(name), data_torch)]
+    #     return [(self.map_tensor_name(name), data_torch)]
 
     def write_tensors(self):
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
@@ -26,6 +26,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ2_M",   LLAMA_FTYPE_MOSTLY_IQ2_M,   " 2.7 bpw quantization",             },
     { "IQ1_S",   LLAMA_FTYPE_MOSTLY_IQ1_S,   " 1.56 bpw quantization",            },
     { "IQ1_M",   LLAMA_FTYPE_MOSTLY_IQ1_M,   " 1.75 bpw quantization",            },
+    { "I2_S",    LLAMA_FTYPE_MOSTLY_I2,      " 2 bpw per-tensor",                 },
     { "Q2_K",    LLAMA_FTYPE_MOSTLY_Q2_K,    " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S",  LLAMA_FTYPE_MOSTLY_Q2_K_S,  " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
    { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization",            },
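Note: with this table entry in place, I2_S can be selected like any other ftype from the quantize tool, e.g. something along the lines of ./quantize model-f16.gguf model-i2_s.gguf I2_S (the file names are illustrative, not taken from the commit; the invocation follows the tool's usual input, output, type ordering).
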
@@ -3306,6 +3306,33 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr
     return nrow * row_size;
 }
 
+size_t quantize_i2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    // 2 bits per weight
+    size_t row_size = ggml_row_size(GGML_TYPE_I2, n_per_row) / 4;
+    char * qrow = (char *)dst;
+    printf("n_row:%d\n", nrow);
+    printf("n_per_row:%d\n", n_per_row);
+    int n = nrow * n_per_row;
+    float accu = 0.0;
+    float min = 0.00001;
+    for (int i = 0; i < n; ++i) {
+        accu += fabs(src[i]);
+    }
+    accu = accu > min ? accu : min;
+    float scale = n / accu;
+
+    printf("\nscale:%f\n", scale);
+
+    // for (int64_t row = 0; row < nrow; ++row) {
+    //     quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
+    //     src += n_per_row;
+    //     qrow += row_size;
+    // }
+
+    // 32B for scale
+    return nrow * row_size + 32;
+}
+
 // ====================== "True" 2-bit (de)-quantization
 
 void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
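Note: at this stage quantize_i2_s only computes the per-tensor scale and reserves space; the commented-out loop is a placeholder for the actual packing. A hedged sketch of what that packing could look like, assuming 2-bit codes with four weights per byte and the scale stored in the 32-byte trailer implied by the "+ 32" return (names and layout are guesses, not the final I2 format):

#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Pack n weights (n assumed to be a multiple of 4) at 2 bits each:
// each weight is rounded to {-1, 0, +1} and stored as the code 0/1/2,
// four codes per byte, followed by the scale in a 32-byte trailer.
static size_t pack_i2_sketch(const float * src, uint8_t * dst, int64_t n, float scale) {
    const int64_t n_bytes = n / 4;
    for (int64_t i = 0; i < n_bytes; ++i) {
        uint8_t byte = 0;
        for (int j = 0; j < 4; ++j) {
            float q = roundf(src[i*4 + j] * scale);
            if (q >  1.0f) q =  1.0f;
            if (q < -1.0f) q = -1.0f;
            byte |= (uint8_t)((int)q + 1) << (2*j);   // 2-bit code per weight
        }
        dst[i] = byte;
    }
    memcpy(dst + n_bytes, &scale, sizeof(scale));     // first bytes of the 32 B trailer
    return (size_t) n_bytes + 32;
}
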
@@ -122,6 +122,7 @@ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
 size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_i2_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 
 void iq2xs_init_impl(enum ggml_type type);
 void iq2xs_free_impl(enum ggml_type type);
ggml.c

@@ -573,7 +573,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_name                = "i2",
         .blck_size                = 1,
         .type_size                = sizeof(int8_t),
-        .is_quantized             = false,
+        .is_quantized             = true,
         .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_i2_q8_0,
         .vec_dot_type             = GGML_TYPE_Q8_0,
         .nrows                    = 1,
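Note: flipping is_quantized to true is what makes ggml report I2 as a quantized type; ggml_is_quantized() simply reads this field from the type_traits table, and parts of the llama.cpp quantize path are gated on it. A minimal check (assuming this branch's GGML_TYPE_I2 value) would be:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    // Reads type_traits[GGML_TYPE_I2].is_quantized, which this hunk sets to true.
    printf("I2 is quantized: %s\n", ggml_is_quantized(GGML_TYPE_I2) ? "yes" : "no");
    return 0;
}
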
@@ -2637,6 +2637,7 @@ inline static void ggml_vec_absmaxclamp_f32(const int n, float * s, float * x, f
     }
     *s = max;
 }
+
 inline static void ggml_vec_scaleroundclamp_f32(const int n, float * s, const float * x, float scale, float min, float max) {
     for (int i = 0; i < n; ++i) {
         s[i] = round(x[i] * scale);
@@ -2645,6 +2646,7 @@ inline static void ggml_vec_scaleroundclamp_f32(const int n, float * s, const fl
         s[i] /= scale;
     }
 }
+
 inline static void ggml_vec_scaleroundclamp_f32_v2(const int n, float * s, int8_t* inp, float scale, float min, float max) {
     float temp;
     for (int i = 0; i < n; ++i) {
@@ -2653,7 +2655,6 @@ inline static void ggml_vec_scaleroundclamp_f32_v2(const int n, float * s, int8_
         if (temp < min) temp = min;
         inp[i] = (int8_t)(temp);
     }
-
 }
 
 //
@@ -21726,6 +21727,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_I2:      result = quantize_i2_s   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
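Note: with the case above added, FP32 data routed through the generic dispatcher now reaches quantize_i2_s. A sketch of such a call, using the ggml_quantize_chunk signature from this version of ggml.h (the helper name is made up for illustration):

#include <stddef.h>
#include <stdint.h>
#include "ggml.h"

// Quantize an FP32 tensor to I2 via the generic dispatcher,
// starting at row 0 and with no importance matrix.
static size_t quantize_tensor_i2(const float * f32_data, void * out,
                                 int64_t nrows, int64_t n_per_row) {
    return ggml_quantize_chunk(GGML_TYPE_I2, f32_data, out, /*start=*/0, nrows, n_per_row, NULL);
}
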
@@ -15634,6 +15634,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
         case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
         case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;
+        case LLAMA_FTYPE_MOSTLY_I2:   default_type = GGML_TYPE_I2;   break;
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
@@ -15658,7 +15659,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S:  default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M:  default_type = GGML_TYPE_IQ3_S;  break;
-        case LLAMA_FTYPE_MOSTLY_I2:     default_type = GGML_TYPE_I2;     break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -15896,10 +15896,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
             new_type = params->output_tensor_type;
         }
-        if (tensor->type == 31) {
-            // no need quantize for i2
-            new_type = tensor->type;
-        }
         // If we've decided to quantize to the same type the tensor is already
         // in then there's nothing to do.
         quantize = tensor->type != new_type;