mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-30 13:53:03 +01:00
iq3_s_mult: scalar dot product
This commit is contained in:
parent
f2c2bd6b26
commit
b48bf8b411
106
ggml-quants.c
106
ggml-quants.c
@ -3789,73 +3789,6 @@ static const uint32_t iq3xxs_grid[256] = {
|
||||
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
||||
};
|
||||
|
||||
static const uint32_t iq3xs_grid[512] = {
|
||||
0x04040404, 0x04142c14, 0x042c2424, 0x0404143c, 0x04140c0c, 0x042c0424, 0x04043434, 0x041c240c,
|
||||
0x04341c1c, 0x040c0c34, 0x041c0404, 0x0434341c, 0x040c2c2c, 0x04241c04, 0x043c1414, 0x0414042c,
|
||||
0x04243c04, 0x04042c14, 0x04142424, 0x042c143c, 0x04040c0c, 0x0c1c0424, 0x0c2c3434, 0x0c04240c,
|
||||
0x0c1c141c, 0x0c340c34, 0x0c0c0404, 0x0c24341c, 0x0c34242c, 0x0c0c1c04, 0x0c240c14, 0x0c3c042c,
|
||||
0x0c143404, 0x0c2c2c14, 0x14041c24, 0x1414143c, 0x142c040c, 0x14043c24, 0x141c2c34, 0x1434240c,
|
||||
0x140c141c, 0x141c0c34, 0x14340404, 0x140c341c, 0x1424242c, 0x143c1c04, 0x14140c14, 0x1424042c,
|
||||
0x1c043404, 0x1c142414, 0x1c2c1c24, 0x1c040c3c, 0x1c1c040c, 0x1c2c3424, 0x1c042c34, 0x1c1c1c0c,
|
||||
0x1c34141c, 0x1c0c0434, 0x1c243c04, 0x1c342c1c, 0x1c0c242c, 0x1c241404, 0x243c0c14, 0x2414042c,
|
||||
0x242c3404, 0x24042414, 0x24141c24, 0x242c0c3c, 0x2404040c, 0x241c3424, 0x24342434, 0x24041c0c,
|
||||
0x241c0c1c, 0x24340434, 0x240c3404, 0x2c242c1c, 0x2c3c1c2c, 0x2c141404, 0x2c240414, 0x2c043c2c,
|
||||
0x2c142c04, 0x2c2c2414, 0x2c041424, 0x2c1c0c3c, 0x2c2c040c, 0x2c043424, 0x2c1c2434, 0x2c341c0c,
|
||||
0x2c0c0c1c, 0x34240434, 0x34343404, 0x340c2c1c, 0x34241c2c, 0x343c1404, 0x34140414, 0x342c342c,
|
||||
0x34042c04, 0x34141c14, 0x342c1424, 0x3404043c, 0x341c3c0c, 0x34342c24, 0x3c042434, 0x3c1c140c,
|
||||
0x3c340c1c, 0x3c0c0434, 0x3c243404, 0x3c3c241c, 0x3c0c1c2c, 0x04240c04, 0x04040414, 0x0414342c,
|
||||
0x042c2c04, 0x04041c14, 0x041c1424, 0x042c043c, 0x04043c0c, 0x041c2c24, 0x04341c34, 0x040c140c,
|
||||
0x0424041c, 0x04343c34, 0x040c2c04, 0x0424241c, 0x043c142c, 0x04140c04, 0x042c0414, 0x0404342c,
|
||||
0x04142404, 0x042c1c14, 0x0c040c24, 0x0c1c043c, 0x0c34340c, 0x0c042c24, 0x0c1c1c34, 0x0c34140c,
|
||||
0x0c0c041c, 0x0c243c34, 0x0c3c2c04, 0x0c0c241c, 0x0c24142c, 0x0c040404, 0x0c143c14, 0x142c2c2c,
|
||||
0x14042404, 0x14141414, 0x142c0c24, 0x1404043c, 0x141c340c, 0x14342424, 0x140c1c34, 0x14240c0c,
|
||||
0x1434041c, 0x140c3434, 0x14242c04, 0x143c1c1c, 0x1414142c, 0x1c2c0404, 0x1c043c14, 0x1c142c2c,
|
||||
0x1c2c2404, 0x1c041414, 0x1c1c0c24, 0x1c343c3c, 0x1c042c0c, 0x1c1c2424, 0x1c341434, 0x1c0c0c0c,
|
||||
0x1c24041c, 0x1c3c3434, 0x240c2404, 0x24241c1c, 0x24040c2c, 0x24140404, 0x242c3414, 0x24042c2c,
|
||||
0x24141c04, 0x242c1414, 0x24040424, 0x241c3c3c, 0x24342c0c, 0x240c2424, 0x241c1434, 0x24340c0c,
|
||||
0x2c0c041c, 0x2c243434, 0x2c3c2404, 0x2c14141c, 0x2c2c0c2c, 0x2c040404, 0x2c143414, 0x2c2c242c,
|
||||
0x2c041c04, 0x2c1c0c14, 0x2c340424, 0x2c04343c, 0x2c1c2c0c, 0x2c341c24, 0x340c1434, 0x3424040c,
|
||||
0x343c3c1c, 0x340c2c34, 0x34242404, 0x3404141c, 0x34140c2c, 0x342c0404, 0x34043414, 0x3414242c,
|
||||
0x342c1c04, 0x34040c14, 0x341c0424, 0x3c34343c, 0x3c0c240c, 0x3c1c1c24, 0x3c340c34, 0x3c0c040c,
|
||||
0x3c24341c, 0x3c3c2c34, 0x04141c04, 0x0424141c, 0x0404042c, 0x04143c04, 0x042c2c14, 0x0404242c,
|
||||
0x041c1404, 0x04340c14, 0x04040424, 0x041c343c, 0x0434240c, 0x040c1c24, 0x04240c34, 0x043c040c,
|
||||
0x040c341c, 0x04242434, 0x04041c04, 0x04140c1c, 0x042c042c, 0x04043404, 0x0c142c14, 0x0c2c1c2c,
|
||||
0x0c041404, 0x0c1c0414, 0x0c343c24, 0x0c0c2c3c, 0x0c1c240c, 0x0c341424, 0x0c0c0c34, 0x0c24040c,
|
||||
0x0c3c341c, 0x0c142434, 0x0c241c04, 0x0c040c1c, 0x1414042c, 0x142c3404, 0x14042c14, 0x141c1c2c,
|
||||
0x142c1404, 0x14040414, 0x141c3424, 0x14342c3c, 0x140c1c0c, 0x14241424, 0x143c0434, 0x140c3c0c,
|
||||
0x14242c1c, 0x1c042434, 0x1c141404, 0x1c2c0c1c, 0x1c04042c, 0x1c143404, 0x1c2c2414, 0x1c041c2c,
|
||||
0x1c1c0c04, 0x1c340414, 0x1c0c3424, 0x1c1c2c3c, 0x1c341c0c, 0x1c0c1424, 0x1c240434, 0x243c3c0c,
|
||||
0x24142c1c, 0x24241c34, 0x24041404, 0x2414041c, 0x242c3c2c, 0x24042c04, 0x241c2414, 0x242c142c,
|
||||
0x24040c04, 0x241c0414, 0x24343424, 0x240c243c, 0x24241c0c, 0x2c340c24, 0x2c0c0434, 0x2c24340c,
|
||||
0x2c3c2c1c, 0x2c141c34, 0x2c2c1404, 0x2c04041c, 0x2c143c2c, 0x2c2c2c04, 0x2c042414, 0x2c1c142c,
|
||||
0x2c340404, 0x2c0c3c14, 0x341c2c24, 0x3434243c, 0x340c140c, 0x34240c24, 0x343c0434, 0x3414340c,
|
||||
0x3424241c, 0x34041c34, 0x34140c04, 0x342c041c, 0x3404342c, 0x341c2c04, 0x342c1c14, 0x3404142c,
|
||||
0x3c1c0404, 0x3c343c14, 0x3c0c2c24, 0x3c24243c, 0x3c34140c, 0x3c0c0c24, 0x3c243c34, 0x043c2c0c,
|
||||
0x0414241c, 0x042c1434, 0x04040c04, 0x0414041c, 0x042c342c, 0x04042404, 0x041c1c14, 0x04340c2c,
|
||||
0x040c0404, 0x041c3414, 0x04342c24, 0x040c1c3c, 0x0424140c, 0x043c0424, 0x04143c34, 0x04242c0c,
|
||||
0x0404241c, 0x04141434, 0x042c0c04, 0x0c04041c, 0x0c1c342c, 0x0c2c2404, 0x0c041414, 0x0c1c0c2c,
|
||||
0x0c340404, 0x0c0c3414, 0x0c242424, 0x0c341c3c, 0x0c0c0c0c, 0x0c240424, 0x0c3c3434, 0x0c142c0c,
|
||||
0x0c2c1c1c, 0x14041434, 0x14140404, 0x142c3c1c, 0x14042c2c, 0x141c2404, 0x14341414, 0x14040c2c,
|
||||
0x141c0404, 0x14343414, 0x140c2424, 0x14241c3c, 0x143c0c0c, 0x14140424, 0x1c243434, 0x1c04240c,
|
||||
0x1c141c1c, 0x1c2c0c34, 0x1c040404, 0x1c1c341c, 0x1c2c2c2c, 0x1c041c04, 0x1c1c1414, 0x1c34042c,
|
||||
0x1c0c3c04, 0x1c242c14, 0x1c342424, 0x1c0c143c, 0x24240c0c, 0x243c0424, 0x24143434, 0x242c240c,
|
||||
0x24041c1c, 0x24140c34, 0x242c0404, 0x2404341c, 0x241c242c, 0x24341c04, 0x24040c14, 0x241c042c,
|
||||
0x24343404, 0x2c0c2c14, 0x2c241c24, 0x2c3c143c, 0x2c0c040c, 0x2c243c24, 0x2c042c34, 0x2c14240c,
|
||||
0x2c2c141c, 0x2c040c34, 0x2c1c0404, 0x2c2c341c, 0x2c04242c, 0x2c1c1c04, 0x2c340c14, 0x340c042c,
|
||||
0x34243404, 0x34342c14, 0x340c1c24, 0x34240c3c, 0x343c040c, 0x34143424, 0x342c2c34, 0x34041c0c,
|
||||
0x3414141c, 0x342c0434, 0x34043c04, 0x341c2c1c, 0x3434242c, 0x3c041404, 0x3c1c0c14, 0x3c34042c,
|
||||
0x3c0c3404, 0x3c242414, 0x3c3c1c24, 0x040c0c3c, 0x0424040c, 0x04043424, 0x04142c34, 0x042c1c0c,
|
||||
0x0404141c, 0x04140434, 0x042c3c04, 0x04042c1c, 0x041c1c2c, 0x04341404, 0x040c0414, 0x041c3c2c,
|
||||
0x04342c04, 0x040c2414, 0x04241424, 0x043c0c3c, 0x0414040c, 0x042c3424, 0x04042434, 0x04141c0c,
|
||||
0x0c2c0c1c, 0x0c040434, 0x0c1c3404, 0x0c342c1c, 0x0c041c2c, 0x0c1c1404, 0x0c340414, 0x0c0c3c2c,
|
||||
0x0c242c04, 0x0c3c2414, 0x0c0c1424, 0x0c24043c, 0x0c043c0c, 0x14142c24, 0x142c2434, 0x1404140c,
|
||||
0x14140c1c, 0x142c0434, 0x14043404, 0x141c241c, 0x14341c2c, 0x140c0c04, 0x141c0414, 0x1434342c,
|
||||
0x140c2c04, 0x14241c14, 0x143c1424, 0x1c14043c, 0x1c243c0c, 0x1c042c24, 0x1c142434, 0x1c2c140c,
|
||||
0x1c040c1c, 0x1c1c3c34, 0x1c342c04, 0x1c04241c, 0x1c1c142c, 0x1c340c04, 0x1c0c0414, 0x1c24342c,
|
||||
0x1c3c2404, 0x240c1c14, 0x24240c24, 0x2404043c, 0x2414340c, 0x242c2c24, 0x24041c34, 0x2414140c,
|
||||
0x242c041c, 0x24043c34, 0x241c2c04, 0x2434241c, 0x240c142c, 0x241c0c04, 0x2c340414, 0x2c0c342c,
|
||||
};
|
||||
|
||||
#define NGRID_IQ2XXS 512
|
||||
static const uint64_t iq1s_grid[NGRID_IQ2XXS] = {
|
||||
0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
|
||||
@ -10214,6 +10147,9 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
||||
|
||||
#else
|
||||
|
||||
uint32_t aux32[2];
|
||||
const uint8_t * grid = (const uint8_t *)aux32;
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
@ -10227,12 +10163,19 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
||||
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
#ifdef IQ3S_SLOW_MULT
|
||||
aux32[0] = (IQ3S_MULTIPLIER * (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))) & 0x0f0f0f0f;
|
||||
aux32[1] = (IQ3S_MULTIPLIER * (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))) & 0x0f0f0f0f;
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi += (2*((grid[j]-1)/2) + 1) * q8[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
#else
|
||||
aux32[0] = ((IQ3S_MULTIPLIER * (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))) & 0x0f0f0f0f) | 0x01010101;
|
||||
aux32[1] = ((IQ3S_MULTIPLIER * (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))) & 0x0f0f0f0f) | 0x01010101;
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi += grid[j] * q8[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
#endif
|
||||
q8 += 8;
|
||||
}
|
||||
qs += 8;
|
||||
@ -10240,11 +10183,18 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
||||
bsum += sumi * ls1;
|
||||
sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
#ifdef IQ3S_SLOW_MULT
|
||||
aux32[0] = (IQ3S_MULTIPLIER * (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))) & 0x0f0f0f0f;
|
||||
aux32[1] = (IQ3S_MULTIPLIER * (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))) & 0x0f0f0f0f;
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi += (2*((grid[j]-1)/2) + 1) * q8[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
#else
|
||||
aux32[0] = ((IQ3S_MULTIPLIER * (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))) & 0x0f0f0f0f) | 0x01010101;
|
||||
aux32[1] = ((IQ3S_MULTIPLIER * (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))) & 0x0f0f0f0f) | 0x01010101;
|
||||
#endif
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi += grid[j] * q8[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
@ -10254,7 +10204,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = 0.25f * sumf;
|
||||
*s = sumf;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user