From 3698f79e6a29de1712ee8190f0908175be8664b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?= Date: Mon, 3 Apr 2023 03:02:26 +0200 Subject: [PATCH] Use full range for q4_0 quantization By keeping the sign of the highest magnitude, we can make sure the highest value maps to -8, which is currently unused. This is a bit of a freebie since it is fully backwards compatible with the current format. quantize-stats output: before(7B): q4_0 : mse 0.00000492, maxerr 0.14257812 after(7B): q4_0 : mse 0.00000386, maxerr 0.18200684 (Most layers have reduced maxerr under this rule, but the total max error is indeed slightly higher) --- ggml.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ggml.c b/ggml.c index 281b20283..bdc3b8f42 100644 --- a/ggml.c +++ b/ggml.c @@ -680,13 +680,17 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max + float max = 0.0f; for (int l = 0; l < QK4_0; l++) { const float v = x[i*QK4_0 + l]; - amax = MAX(amax, fabsf(v)); + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } } - const float d = amax / ((1 << 3) - 1); + const float d = max / -8; const float id = d ? 1.0f/d : 0.0f; y[i].d = d; @@ -695,8 +699,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r const float v0 = x[i*QK4_0 + l + 0]*id; const float v1 = x[i*QK4_0 + l + 1]*id; - const uint8_t vi0 = (int8_t)roundf(v0) + 8; - const uint8_t vi1 = (int8_t)roundf(v1) + 8; + const uint8_t vi0 = MIN(15, (int8_t)roundf(v0) + 8); + const uint8_t vi1 = MIN(15, (int8_t)roundf(v1) + 8); assert(vi0 < 16); assert(vi1 < 16);