mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-27 20:43:07 +01:00
WIP: make i-quants work for QK_K = 64
This commit is contained in:
parent
0becb22ac0
commit
13ba37f1aa
@ -4227,6 +4227,9 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
|
||||
|
||||
void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
#if QK_K == 64
|
||||
dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
|
||||
#else
|
||||
const int nb = k / QK_K;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
@ -4246,6 +4249,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
|
||||
qs += 16;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
//===================================== Q8_K ==============================================
|
||||
@ -10455,6 +10459,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
assert(n % QK_K == 0);
|
||||
#if QK_K == 64
|
||||
ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#else
|
||||
|
||||
const block_iq4_xs * restrict x = vx;
|
||||
const block_q8_K * restrict y = vy;
|
||||
@ -10574,6 +10581,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||
}
|
||||
*s = sumf;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
// ================================ IQ2 quantization =============================================
|
||||
@ -10921,7 +10929,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
||||
|
||||
const int kMaxQ = 3;
|
||||
|
||||
const int nbl = n/256;
|
||||
const int nbl = n/QK_K;
|
||||
|
||||
block_iq2_xxs * y = vy;
|
||||
|
||||
@ -11094,7 +11102,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
||||
|
||||
const int kMaxQ = 3;
|
||||
|
||||
const int nbl = n/256;
|
||||
const int nbl = n/QK_K;
|
||||
|
||||
block_iq2_xs * y = vy;
|
||||
|
||||
@ -12037,7 +12045,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
||||
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
|
||||
const int nbl = n/256;
|
||||
const int nbl = n/QK_K;
|
||||
|
||||
block_iq1_s * y = vy;
|
||||
|
||||
@ -12315,6 +12323,9 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest
|
||||
}
|
||||
|
||||
size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
#if QK_K == 64
|
||||
return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
|
||||
#else
|
||||
(void)hist;
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
int nblock = n_per_row/QK_K;
|
||||
@ -12333,6 +12344,7 @@ size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, i
|
||||
qrow += nblock*sizeof(block_iq4_xs);
|
||||
}
|
||||
return nrow * nblock * sizeof(block_iq4_xs);
|
||||
#endif
|
||||
}
|
||||
|
||||
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
|
||||
@ -12363,7 +12375,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
||||
|
||||
const int kMaxQ = 3;
|
||||
|
||||
const int nbl = n/256;
|
||||
const int nbl = n/QK_K;
|
||||
|
||||
block_iq2_s * y = vy;
|
||||
|
||||
|
@ -230,6 +230,10 @@ typedef struct {
|
||||
} block_iq4_nl;
|
||||
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
||||
|
||||
#if QK_K == 64
|
||||
#define block_iq4_xs block_iq4_nl
|
||||
//typedef struct block_iq4_nl block_iq4_xs;
|
||||
#else
|
||||
typedef struct {
|
||||
ggml_fp16_t d;
|
||||
uint16_t scales_h;
|
||||
@ -237,6 +241,7 @@ typedef struct {
|
||||
uint8_t qs[QK_K/2];
|
||||
} block_iq4_xs;
|
||||
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
15
ggml.c
15
ggml.c
@ -728,14 +728,22 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
},
|
||||
[GGML_TYPE_IQ4_XS] = {
|
||||
.type_name = "iq4_xs",
|
||||
#if QK_K == 64
|
||||
.blck_size = QK4_NL,
|
||||
#else
|
||||
.blck_size = QK_K,
|
||||
#endif
|
||||
.type_size = sizeof(block_iq4_xs),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
|
||||
.from_float = quantize_row_iq4_xs,
|
||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
|
||||
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|
||||
#if QK_K == 64
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
#else
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
#endif
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q8_K] = {
|
||||
@ -19830,6 +19838,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||
GGML_ASSERT(result == row_size * nrows);
|
||||
} break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
#if QK_K == 64
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
#endif
|
||||
{
|
||||
GGML_ASSERT(start % QK4_NL == 0);
|
||||
GGML_ASSERT(start % n_per_row == 0);
|
||||
@ -19838,15 +19849,17 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||
result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||
GGML_ASSERT(result == row_size * nrows);
|
||||
} break;
|
||||
#if QK_K != 64
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
{
|
||||
GGML_ASSERT(start % QK4_NL == 0);
|
||||
GGML_ASSERT(start % QK_K == 0);
|
||||
GGML_ASSERT(start % n_per_row == 0);
|
||||
size_t start_row = start / n_per_row;
|
||||
size_t row_size = ggml_row_size(type, n_per_row);
|
||||
result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||
GGML_ASSERT(result == row_size * nrows);
|
||||
} break;
|
||||
#endif
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
size_t elemsize = sizeof(ggml_fp16_t);
|
||||
|
Loading…
Reference in New Issue
Block a user