From f7d05095b404b5500b4a702ea16f67fc22446e49 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Wed, 19 Apr 2023 20:20:14 +0200 Subject: [PATCH] Q4_2 quantization with rmse-optimized scale and quants (#1062) * Q4_2 quantization with rmse-optimized scale and quants For quantize-stats we get q4_2: rmse 0.00159301, maxerr 0.17480469, 95pct<0.0030, median<0.0012 For 7B perplexity with BLAS enabled we get 6.2038 after 655 chunks. Quantization is slow (~90 seconds on my Mac for 7B) as not multi-threaded as in PR #896. * ggml : satisfy the sanitizer builds Not sure why this makes them fail * Better follow ggml conventions for function names * Fixed type as per reviewer comment --------- Co-authored-by: Iwan Kawrakow Co-authored-by: Georgi Gerganov --- ggml.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/ggml.c b/ggml.c index 3b38eaad3..431cdb9c9 100644 --- a/ggml.c +++ b/ggml.c @@ -19,6 +19,7 @@ #include #include #include +#include // if C99 - static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 @@ -1135,12 +1136,94 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r } } +static inline int nearest_int(float fval) { + assert(fval <= 4194303.f); + float val = fval + 12582912.f; + int i; memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + +static float kquantize_q4_with_bounds(int n, int nmin, int nmax, const float * restrict X, int nCandidates, + const float * restrict candidates, int8_t * restrict L) { + assert (nmin >= INT8_MIN); + assert (nmax <= INT8_MAX); + float amax = 0; + for (int i=0; i sumlxM2*suml2P) { + if (sumlxP2 > best*suml2P) { + best = sumlxP2/suml2P; bestScale = iscale; + } + } else { + if (sumlxM2 > best*suml2M) { + best = sumlxM2/suml2M; bestScale = -iscale; + } + } + } + float sumlx = 0; int suml2 = 0; + for (int i=0; i