mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-28 15:18:26 +01:00
iq3_s_mult: play with blocks of 16
This brings the bpw to 3.5625. We come close but don't quite match lookup with 3.4375 bpw (blocks of 32)
This commit is contained in:
parent
dbe98dfe70
commit
f4cb4eac45
@ -544,14 +544,15 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong
|
||||
|
||||
#define QR3_XS 8
|
||||
#define QI3_XS (QK_K / (4*QR3_XS))
|
||||
#define IQ3S_BLOCK_SIZE 16
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[QK_K/4];
|
||||
uint8_t qh[QK_K/32];
|
||||
uint8_t signs[QK_K/8];
|
||||
uint8_t scales[QK_K/64];
|
||||
uint8_t scales[QK_K/(2*IQ3S_BLOCK_SIZE)];
|
||||
} block_iq3_s;
|
||||
static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 27*(QK_K/64), "wrong iq3_s block size/padding");
|
||||
static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + QK_K/(2*IQ3S_BLOCK_SIZE), "wrong iq3_s block size/padding");
|
||||
|
||||
#define QR1_S 8
|
||||
#define QI1_S (QK_K / (4*QR1_S))
|
||||
@ -2392,7 +2393,8 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
|
||||
const uint8_t * qs = x[i].qs + 8*ib;
|
||||
int32_t aux32[2];
|
||||
const int8_t * grid = (const int8_t *)aux32;
|
||||
const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
|
||||
const int is = (32*ib + 8*il)/IQ3S_BLOCK_SIZE;
|
||||
const float d = (float)x[i].d * (1 + 2*((x[i].scales[is/2] >> 4*(is%2)) & 0xf));
|
||||
const uint8_t signs = x[i].signs[4*ib + il];
|
||||
#ifdef IQ3S_SLOW_MULT
|
||||
aux32[0] = ((qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
|
||||
|
@ -4138,6 +4138,8 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
|
||||
uint32_t aux32[2];
|
||||
const int8_t * grid = (const int8_t *)aux32;
|
||||
|
||||
float db[64/IQ3S_BLOCK_SIZE];
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d);
|
||||
@ -4146,20 +4148,28 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
|
||||
const uint8_t * signs = x[i].signs;
|
||||
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
const float db1 = d * (1 + 2*(x[i].scales[ib32/2] & 0xf));
|
||||
const float db2 = d * (1 + 2*(x[i].scales[ib32/2] >> 4));
|
||||
#if IQ3S_BLOCK_SIZE == 16
|
||||
db[0] = d * (1 + 2*(x[i].scales[ib32+0] & 0xf));
|
||||
db[1] = d * (1 + 2*(x[i].scales[ib32+0] >> 4));
|
||||
db[2] = d * (1 + 2*(x[i].scales[ib32+1] & 0xf));
|
||||
db[3] = d * (1 + 2*(x[i].scales[ib32+1] >> 4));
|
||||
#else
|
||||
db[0] = d * (1 + 2*(x[i].scales[ib32/2] & 0xf));
|
||||
db[1] = d * (1 + 2*(x[i].scales[ib32/2] >> 4));
|
||||
#endif
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const float dl = db[8*l/IQ3S_BLOCK_SIZE];
|
||||
#ifdef IQ3S_SLOW_MULT
|
||||
aux32[0] = ((qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
|
||||
aux32[1] = ((qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = db1 * (2*((grid[j]-1)/2) + 1) * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
y[j] = dl * (2*((grid[j]-1)/2) + 1) * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
}
|
||||
#else
|
||||
aux32[0] = (((qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f) | 0x01010101;
|
||||
aux32[1] = (((qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f) | 0x01010101;
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = db1 * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
y[j] = dl * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
}
|
||||
#endif
|
||||
y += 8;
|
||||
@ -4167,18 +4177,19 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
|
||||
qs += 8;
|
||||
signs += 4;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const float dl = db[(8*l+32)/IQ3S_BLOCK_SIZE];
|
||||
#ifdef IQ3S_SLOW_MULT
|
||||
aux32[0] = ((qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
|
||||
aux32[1] = ((qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = db2 * (2*((grid[j]-1)/2) + 1) * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
y[j] = dl * (2*((grid[j]-1)/2) + 1) * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
}
|
||||
#else
|
||||
aux32[0] = (((qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f) | 0x01010101;
|
||||
aux32[1] = (((qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f) | 0x01010101;
|
||||
#endif
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = db2 * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
y[j] = dl * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
}
|
||||
y += 8;
|
||||
}
|
||||
@ -12109,7 +12120,6 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
||||
}
|
||||
}
|
||||
|
||||
#define IQ3S_BLOCK_SIZE 32
|
||||
size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
|
@ -201,10 +201,11 @@ typedef struct {
|
||||
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
||||
|
||||
// 3.4375 bpw
|
||||
#define IQ3S_BLOCK_SIZE 16
|
||||
#if QK_K == 64
|
||||
#define IQ3S_N_SCALE 2
|
||||
#else
|
||||
#define IQ3S_N_SCALE QK_K/64
|
||||
#define IQ3S_N_SCALE QK_K/(2*IQ3S_BLOCK_SIZE)
|
||||
#endif
|
||||
typedef struct {
|
||||
ggml_fp16_t d;
|
||||
|
Loading…
Reference in New Issue
Block a user