mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-01 00:39:00 +01:00
iq3s_mult: quantization tuning
This commit is contained in:
parent
5b9c8785fa
commit
8b713a987e
@@ -12092,7 +12092,11 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|||||||
}
|
}
|
||||||
|
|
||||||
float d = max_scale/31;
|
float d = max_scale/31;
|
||||||
y[ibl].d = GGML_FP32_TO_FP16(d * 1.025f); //1.033f);
|
#ifdef IQ3S_SLOW_MULT
|
||||||
|
y[ibl].d = GGML_FP32_TO_FP16(d * 1.025f);
|
||||||
|
#else
|
||||||
|
y[ibl].d = GGML_FP32_TO_FP16(d * 1.030f);
|
||||||
|
#endif
|
||||||
float id = 1/d;
|
float id = 1/d;
|
||||||
for (int ib = 0; ib < QK_K/block_size; ib += 2) {
|
for (int ib = 0; ib < QK_K/block_size; ib += 2) {
|
||||||
int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
|
int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
|
||||||
|
Loading…
Reference in New Issue
Block a user