diff --git a/ggml-quants.c b/ggml-quants.c index 273cdee70..72149d4a0 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -664,13 +664,13 @@ void quantize_row_i8_s(const float * x, void * y, int64_t n, float* act_scales) double min = 0.00001; double max = min; for (int i = 0; i < n; ++i) { - max = MAX(max, (double)fabs(x[i])); + max = MAX(max, (double)fabs((double)x[i])); } float s = 127 / max; act_scales[0] = s; float temp; for (int i = 0; i < n; ++i) { - temp = round(x[i] * s); + temp = round((double)(x[i] * s)); if (temp > 127) temp = 127; if (temp < -128) temp = -128; dst[i] = (int8_t)(temp); @@ -3335,14 +3335,14 @@ size_t quantize_i2_s(const float * restrict src, void * restrict dst, int64_t nr // f32 -> q8 double i2_scale = 0; for (int i=0; i 1e-6) { - i2_scale = src[i]; + if (fabs((double)(src[i])) > 1e-6) { + i2_scale = (double)src[i]; } } uint8_t* q8 = (uint8_t*)dst; for (int i=0; i