mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-26 03:12:23 +01:00
ggml : fix quants nans when all the group weights are very close to zero (#7313)
This commit is contained in:
parent
ef277de2ad
commit
05834841dc
@ -14,6 +14,12 @@
|
|||||||
#include <stdlib.h> // for qsort
|
#include <stdlib.h> // for qsort
|
||||||
#include <stdio.h> // for GGML_ASSERT
|
#include <stdio.h> // for GGML_ASSERT
|
||||||
|
|
||||||
|
#define GROUP_MAX_EPS 1e-15f
|
||||||
|
#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
|
||||||
|
#define GROUP_MAX_EPS_IQ2_S 1e-8f
|
||||||
|
#define GROUP_MAX_EPS_IQ1_M 1e-7f
|
||||||
|
#define GROUP_MAX_EPS_IQ1_S 1e-12f
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
// disable "possible loss of data" to avoid warnings for hundreds of casts
|
// disable "possible loss of data" to avoid warnings for hundreds of casts
|
||||||
// we should just be careful :)
|
// we should just be careful :)
|
||||||
@ -1109,7 +1115,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|||||||
float ax = fabsf(x[i]);
|
float ax = fabsf(x[i]);
|
||||||
if (ax > amax) { amax = ax; max = x[i]; }
|
if (ax > amax) { amax = ax; max = x[i]; }
|
||||||
}
|
}
|
||||||
if (amax < 1e-30f) { // all zero
|
if (amax < GROUP_MAX_EPS) { // all zero
|
||||||
for (int i = 0; i < n; ++i) {
|
for (int i = 0; i < n; ++i) {
|
||||||
L[i] = 0;
|
L[i] = 0;
|
||||||
}
|
}
|
||||||
@ -1177,7 +1183,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
|
|||||||
float ax = fabsf(x[i]);
|
float ax = fabsf(x[i]);
|
||||||
if (ax > amax) { amax = ax; max = x[i]; }
|
if (ax > amax) { amax = ax; max = x[i]; }
|
||||||
}
|
}
|
||||||
if (!amax) { // all zero
|
if (amax < GROUP_MAX_EPS) { // all zero
|
||||||
for (int i = 0; i < n; ++i) { L[i] = 0; }
|
for (int i = 0; i < n; ++i) { L[i] = 0; }
|
||||||
return 0.f;
|
return 0.f;
|
||||||
}
|
}
|
||||||
@ -1646,7 +1652,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t *
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return sumlx / suml2;
|
return sumlx/suml2;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
|
static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
|
||||||
@ -2653,7 +2659,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!max_abs_scale) {
|
if (max_abs_scale < GROUP_MAX_EPS) {
|
||||||
memset(&y[i], 0, sizeof(block_q6_K));
|
memset(&y[i], 0, sizeof(block_q6_K));
|
||||||
y[i].d = GGML_FP32_TO_FP16(0.f);
|
y[i].d = GGML_FP32_TO_FP16(0.f);
|
||||||
x += QK_K;
|
x += QK_K;
|
||||||
@ -2805,7 +2811,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!max_abs_scale) {
|
if (max_abs_scale < GROUP_MAX_EPS) {
|
||||||
memset(&y[i], 0, sizeof(block_q6_K));
|
memset(&y[i], 0, sizeof(block_q6_K));
|
||||||
y[i].d = GGML_FP32_TO_FP16(0.f);
|
y[i].d = GGML_FP32_TO_FP16(0.f);
|
||||||
x += QK_K;
|
x += QK_K;
|
||||||
@ -12599,7 +12605,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|||||||
}
|
}
|
||||||
float max = xval[0];
|
float max = xval[0];
|
||||||
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
||||||
if (!max) {
|
if (max < GROUP_MAX_EPS) {
|
||||||
scales[ib] = 0;
|
scales[ib] = 0;
|
||||||
memset(L, 0, 32);
|
memset(L, 0, 32);
|
||||||
continue;
|
continue;
|
||||||
@ -12775,7 +12781,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|||||||
}
|
}
|
||||||
float max = xval[0];
|
float max = xval[0];
|
||||||
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
||||||
if (!max) {
|
if (max < GROUP_MAX_EPS) {
|
||||||
scales[ib] = 0;
|
scales[ib] = 0;
|
||||||
memset(L, 0, 16);
|
memset(L, 0, 16);
|
||||||
continue;
|
continue;
|
||||||
@ -13216,7 +13222,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
|||||||
}
|
}
|
||||||
float max = xval[0];
|
float max = xval[0];
|
||||||
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
||||||
if (!max) {
|
if (max < GROUP_MAX_EPS_IQ3_XXS) {
|
||||||
scales[ib] = 0;
|
scales[ib] = 0;
|
||||||
memset(L, 0, 32);
|
memset(L, 0, 32);
|
||||||
continue;
|
continue;
|
||||||
@ -13756,7 +13762,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|||||||
for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
||||||
float max = fabsf(xb[0]);
|
float max = fabsf(xb[0]);
|
||||||
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
||||||
if (!max) {
|
if (max < GROUP_MAX_EPS_IQ1_S) {
|
||||||
scales[ib] = 0;
|
scales[ib] = 0;
|
||||||
memset(L, 1, block_size);
|
memset(L, 1, block_size);
|
||||||
continue;
|
continue;
|
||||||
@ -13944,7 +13950,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
|
|||||||
}
|
}
|
||||||
float max = fabsf(xb[0]);
|
float max = fabsf(xb[0]);
|
||||||
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
||||||
if (!max) {
|
if (max < GROUP_MAX_EPS_IQ1_M) {
|
||||||
scales[ib] = 0;
|
scales[ib] = 0;
|
||||||
memset(L, 1, block_size);
|
memset(L, 1, block_size);
|
||||||
continue;
|
continue;
|
||||||
@ -14208,7 +14214,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
|
|||||||
amax = ax; max = xb[j];
|
amax = ax; max = xb[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!amax) {
|
if (amax < GROUP_MAX_EPS) {
|
||||||
scales[ib] = 0;
|
scales[ib] = 0;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -14429,7 +14435,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
|||||||
}
|
}
|
||||||
float max = xval[0];
|
float max = xval[0];
|
||||||
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
||||||
if (!max) {
|
if (max < GROUP_MAX_EPS_IQ2_S) {
|
||||||
scales[ib] = 0;
|
scales[ib] = 0;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
#include <thread>
|
#include <thread>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
|
||||||
static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
|
static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
|
||||||
// static RNG initialization (revisit if n_threads stops being constant)
|
// static RNG initialization (revisit if n_threads stops being constant)
|
||||||
static const size_t n_threads = std::thread::hardware_concurrency();
|
static const size_t n_threads = std::thread::hardware_concurrency();
|
||||||
@ -49,6 +50,22 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
|||||||
t.join();
|
t.join();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
const char * val_str = getenv("GGML_TEST_EPS");
|
||||||
|
float val = 1e-9f;
|
||||||
|
if (val_str != nullptr) {
|
||||||
|
val = std::stof(val_str);
|
||||||
|
printf("GGML_TEST_EPS=%e\n", val);
|
||||||
|
}
|
||||||
|
|
||||||
|
// test quantization with very small values that may result in nan scales due to division by zero
|
||||||
|
if (ggml_is_quantized(tensor->type)) {
|
||||||
|
for (int i = 0; i < 256; i++) {
|
||||||
|
data[i] = val;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
|
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
|
||||||
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
|
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
|
||||||
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
|
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
|
||||||
@ -64,6 +81,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
|
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
|
||||||
|
GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
|
||||||
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
||||||
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
|
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
|
||||||
// This is going to create some weird integers though.
|
// This is going to create some weird integers though.
|
||||||
|
Loading…
Reference in New Issue
Block a user