Make quantize_row_iq4_nl do the same thing as quantization on CUDA
This time for real. backend-ops tests pass.
commit 30eef31b07
parent cd4a7c4cb4
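For context: the scalar quantizer previously initialized its scale as d = -max/values[0] unconditionally; the patch below makes the sign depend on ntry and records each chosen codebook index in Lb. Presumably the new ntry == 0 branch (d = max/values[0]) is the CUDA-matching behavior, since that is what the commit introduces. A minimal standalone sketch of this nearest-entry quantization, assuming the 16-entry IQ4_NL codebook copied from ggml's kvalues_iq4nl table and using a linear scan as a stand-in for the real best_index_int8():

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    /* IQ4_NL codebook; values copied from ggml's kvalues_iq4nl table
     * (treat the exact numbers as an assumption of this sketch). */
    static const int8_t kvalues[16] = {
        -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
    };

    /* Linear-scan stand-in for ggml's best_index_int8(): returns the index
     * of the codebook entry closest to x. Illustrative only; the real
     * function searches the same table more efficiently. */
    static int nearest_index(int n, const int8_t * values, float x) {
        int best = 0;
        for (int i = 1; i < n; ++i) {
            if (fabsf(x - values[i]) < fabsf(x - values[best])) best = i;
        }
        return best;
    }

    int main(void) {
        float xb[4] = { -0.9f, -0.1f, 0.4f, 1.2f }; /* toy block */
        float max = 1.2f; /* element with the largest magnitude, sign kept */

        /* The ntry == 0 branch of the patch: d = max/values[0] maps the
         * extreme element exactly onto values[0]. */
        float d  = max / kvalues[0];
        float id = 1.0f / d;
        for (int j = 0; j < 4; ++j) {
            int l = nearest_index(16, kvalues, id * xb[j]); /* what Lb[j] records */
            printf("x = %5.2f -> index %2d -> dequant %7.4f\n", xb[j], l, d * kvalues[l]);
        }
        return 0;
    }

Because the IQ4_NL codebook is asymmetric (-127 ... 113), flipping the sign of d changes which end of the table the extreme value lands on, so the two formulas in the ternary are not equivalent.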
@@ -11718,6 +11718,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
     float max_scale = 0, amax_scale = 0;
     for (int ib = 0; ib < super_block_size/block_size; ++ib) {
         const float * xb = x + ib*block_size;
+        uint8_t * Lb = L + ib*block_size;
         if (quant_weights) {
             const float * qw = quant_weights + ib*block_size;
             for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
@@ -11735,12 +11736,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
             scales[ib] = 0;
             continue;
         }
-        float d = -max/values[0];
+        float d = ntry > 0 ? -max/values[0] : max/values[0];
         float id = 1/d;
         float sumqx = 0, sumq2 = 0;
         for (int j = 0; j < block_size; ++j) {
             float al = id*xb[j];
             int l = best_index_int8(16, values, al);
+            Lb[j] = l;
             float q = values[l];
             float w = weight[j];
             sumqx += w*q*xb[j];
@@ -11795,11 +11797,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
         }
     } else {
         dh[0] = GGML_FP32_TO_FP16(scales[0]);
-        float id = scales[0] ? 1/scales[0] : 0;
-        for (int j = 0; j < super_block_size; ++j) {
-            L[j] = best_index_int8(16, values, id*x[j]);
-        }
+        if (ntry > 0) {
+            float id = scales[0] ? 1/scales[0] : 0;
+            for (int j = 0; j < super_block_size; ++j) {
+                L[j] = best_index_int8(16, values, id*x[j]);
+            }
+        }
     }
 
     for (int i = 0; i < super_block_size/32; ++i) {
         for (int j = 0; j < 16; ++j) {
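The last hunk ends right where the final packing loop begins. For orientation, a sketch of the kind of 4-bit packing such a super_block_size/32 by 16 loop typically performs, two codebook indices per byte; the actual loop body is outside the shown hunks, so the low-nibble-first layout here is an assumption:

    #include <stdint.h>

    /* Pack 4-bit codebook indices (values 0..15) from L into q4:
     * within each 32-wide group, index j is assumed to go to the low
     * nibble and index j+16 to the high nibble of byte 16*i + j. */
    static void pack_nibbles(const uint8_t * L, uint8_t * q4, int super_block_size) {
        for (int i = 0; i < super_block_size/32; ++i) {
            for (int j = 0; j < 16; ++j) {
                q4[16*i + j] = (uint8_t)(L[32*i + j] | (L[32*i + 16 + j] << 4));
            }
        }
    }

With the Lb[j] = l; line added above, the scalar path now fills L with exactly the indices it used during scale search, so whatever this packing loop emits stays consistent with the chosen scale.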