Deduplicate q4 quantization functions (#383)

* Deduplicate q4 quantization functions

* Use const; add basic test

* Re-enable quantization test

* Disable AVX2 flags in CI

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Stephan Walter 2023-03-22 17:29:06 +00:00 committed by GitHub
parent 97940520e8
commit 69c92298a9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 119 additions and 113 deletions

View File

@ -89,7 +89,7 @@ jobs:
run: | run: |
mkdir build mkdir build
cd build cd build
cmake .. cmake -DLLAMA_AVX2=OFF ..
cmake --build . --config Release cmake --build . --config Release
ctest --output-on-failure ctest --output-on-failure

155
ggml.c
View File

@ -403,9 +403,55 @@ static inline __m128i packNibbles( __m256i bytes )
// method 5 // method 5
// blocks of QK elements // blocks of QK elements
// represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)
// reference implementation for deterministic creation of model files
static void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) {
assert(k % QK == 0);
const int nb = k / QK;
const size_t bs = sizeof(float) + QK/2;
uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));
uint8_t pp[QK/2];
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
for (int l = 0; l < QK; l++) {
const float v = x[i*QK + l];
amax = MAX(amax, fabsf(v));
}
const float d = amax / ((1 << 3) - 1);
const float id = d ? 1.0f/d : 0.0f;
*(float *)pd = d;
pd += bs;
for (int l = 0; l < QK; l += 2) {
const float v0 = x[i*QK + l + 0]*id;
const float v1 = x[i*QK + l + 1]*id;
const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);
pp[l/2] = vi0 | (vi1 << 4);
}
memcpy(pb, pp, sizeof(pp));
pb += bs;
}
}
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
assert(k % QK == 0); assert(k % QK == 0);
#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__)
const int nb = k / QK; const int nb = k / QK;
const size_t bs = sizeof(float) + QK/2; const size_t bs = sizeof(float) + QK/2;
@ -413,6 +459,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float)); uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));
uint8_t pp[QK/2]; uint8_t pp[QK/2];
#endif
#if __ARM_NEON #if __ARM_NEON
#if QK == 32 #if QK == 32
@ -569,36 +616,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
#endif #endif
#else #else
// scalar // scalar
for (int i = 0; i < nb; i++) { quantize_row_q4_0_reference(x, y, k);
float amax = 0.0f; // absolute max
for (int l = 0; l < QK; l++) {
const float v = x[i*QK + l];
amax = MAX(amax, fabsf(v));
}
const float d = amax / ((1 << 3) - 1);
const float id = d ? 1.0f/d : 0.0f;
*(float *)pd = d;
pd += bs;
for (int l = 0; l < QK; l += 2) {
const float v0 = x[i*QK + l + 0]*id;
const float v1 = x[i*QK + l + 1]*id;
const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);
pp[l/2] = vi0 | (vi1 << 4);
}
memcpy(pb, pp, sizeof(pp));
pb += bs;
}
#endif #endif
} }
@ -10705,121 +10723,62 @@ enum ggml_opt_result ggml_opt(
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) { size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk; const int nb = k / qk;
const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2); const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t row_size = nb*bs; const size_t row_size = nb*bs;
assert(k % qk == 0); assert(k % qk == 0);
const size_t pp_size = qk / 2;
uint8_t * pp = (uint8_t *) alloca(pp_size);
char * pdst = (char *) dst; char * pdst = (char *) dst;
for (int j = 0; j < n; j += k) { for (int j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
quantize_row_q4_0_reference(src + j, pd, k);
for (int i = 0; i < nb; i++) { for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
{
for (int l = 0; l < qk; l++) {
const float v = src[j + i*qk + l];
amax = MAX(amax, fabsf(v));
}
const float d = amax / ((1 << 3) - 1);
const float id = d ? 1.0f/d : 0.0f;
*(float *) pd = d;
pd += bs;
for (int l = 0; l < qk; l += 2) { for (int l = 0; l < qk; l += 2) {
const float v0 = (src[j + i*qk + l + 0])*id; const uint8_t vi0 = pb[l/2] & 0xF;
const float v1 = (src[j + i*qk + l + 1])*id; const uint8_t vi1 = pb[l/2] >> 4;
const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);
hist[vi0]++; hist[vi0]++;
hist[vi1]++; hist[vi1]++;
pp[l/2] = vi0 | (vi1 << 4);
} }
memcpy(pb, pp, pp_size);
pb += bs; pb += bs;
} }
} }
}
return (n/k)*row_size; return (n/k)*row_size;
} }
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) { size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk; const int nb = k / qk;
const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2); const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t row_size = nb*bs; const size_t row_size = nb*bs;
assert(k % qk == 0); assert(k % qk == 0);
const size_t pp_size = qk / 2;
uint8_t * pp = (uint8_t *) alloca(pp_size);
char * pdst = (char *) dst; char * pdst = (char *) dst;
for (int j = 0; j < n; j += k) { for (int j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float)); uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));
//printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb); quantize_row_q4_1(src + j, pd, k);
for (int i = 0; i < nb; i++) { for (int i = 0; i < nb; i++) {
float min = FLT_MAX;
float max = -FLT_MAX;
{
for (int l = 0; l < qk; l++) {
const float v = src[j + i*qk + l];
if (v < min) min = v;
if (v > max) max = v;
}
const float d = (max - min) / ((1 << 4) - 1);
const float id = d ? 1.0f/d : 0.0f;
*(float *) pd = d;
*(float *) pm = min;
pd += bs;
pm += bs;
for (int l = 0; l < qk; l += 2) { for (int l = 0; l < qk; l += 2) {
const float v0 = (src[j + i*qk + l + 0] - min)*id; const uint8_t vi0 = pb[l/2] & 0xF;
const float v1 = (src[j + i*qk + l + 1] - min)*id; const uint8_t vi1 = pb[l/2] >> 4;
const uint8_t vi0 = round(v0);
const uint8_t vi1 = round(v1);
assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);
hist[vi0]++; hist[vi0]++;
hist[vi1]++; hist[vi1]++;
pp[l/2] = vi0 | (vi1 << 4);
} }
memcpy(pb, pp, pp_size);
pb += bs; pb += bs;
} }
} }
}
return (n/k)*row_size; return (n/k)*row_size;
} }

4
ggml.h
View File

@ -745,8 +745,8 @@ enum ggml_opt_result ggml_opt(
// quantization // quantization
// //
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist); size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist); size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
// //
// system info // system info

View File

@ -1,4 +1,9 @@
set(TEST_TARGET test-tokenizer-0) function(llama_add_test source)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source})
target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils) target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction()
llama_add_test(test-quantize.c)
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)

42
tests/test-quantize.c Normal file
View File

@ -0,0 +1,42 @@
#include "ggml.h"
#undef NDEBUG
#include <assert.h>
#include <math.h>
int main(void) {
#define QK 32
float src[QK];
uint8_t dst[24];
int64_t hist[16];
for (int i = 0; i < QK; i++) {
src[i] = (float)(i + 1);
}
size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist);
assert(size == 20);
float max_result = ((float *)dst)[0];
float max_expected = src[31] / ((1 << 3) - 1);
assert(max_result == max_expected);
for (int i = 0; i < QK; i++) {
uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
assert(q4_result == q4_expected);
}
size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist);
assert(size == 24);
float delta_result = ((float *)dst)[0];
float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
assert(delta_result == delta_expected);
float min_result = ((float *)dst)[1];
float min_expected = src[0];
assert(min_result == min_expected);
for (int i = 0; i < QK; i++) {
uint8_t q4_result = (i % 2) ? (dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF);
uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected);
assert(q4_result == q4_expected);
}
return 0;
}