From 9c752ff0d3a9e98c550e44083af114cc28a3b907 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Fri, 1 Mar 2024 11:52:17 +0200 Subject: [PATCH] Trying IQ3_S without a lookup table --- ggml-cuda.cu | 185 ++++++++++++++---------- ggml-quants.c | 391 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 384 insertions(+), 192 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index dfd28df62..a58214557 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2008,72 +2008,72 @@ static const __device__ uint32_t iq3xxs_grid[256] = { 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, }; -static const __device__ uint32_t iq3xs_grid[512] = { - 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14, - 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414, - 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24, - 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c, - 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c, - 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34, - 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c, - 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414, - 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c, - 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404, - 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434, - 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c, - 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404, - 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414, - 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414, - 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404, - 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c, - 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c, - 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404, - 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e, - 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14, - 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c, - 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424, - 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c, - 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c, - 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e, - 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e, - 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e, - 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424, - 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e, - 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424, - 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404, - 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c, - 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e, - 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c, - 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c, - 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c, - 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404, - 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04, - 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c, - 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414, - 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c, - 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c, - 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424, - 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c, - 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c, - 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414, - 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c, - 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e, - 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04, - 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424, - 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14, - 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34, - 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c, - 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434, - 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c, - 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424, - 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24, - 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24, - 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e, - 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c, - 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c, - 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c, - 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404, -}; +//static const __device__ uint32_t iq3xs_grid[512] = { +//0x04040404, 0x04142c14, 0x042c2424, 0x0404143c, 0x04140c0c, 0x042c0424, 0x04043434, 0x041c240c, +//0x04341c1c, 0x040c0c34, 0x041c0404, 0x0434341c, 0x040c2c2c, 0x04241c04, 0x043c1414, 0x0414042c, +//0x04243c04, 0x04042c14, 0x04142424, 0x042c143c, 0x04040c0c, 0x0c1c0424, 0x0c2c3434, 0x0c04240c, +//0x0c1c141c, 0x0c340c34, 0x0c0c0404, 0x0c24341c, 0x0c34242c, 0x0c0c1c04, 0x0c240c14, 0x0c3c042c, +//0x0c143404, 0x0c2c2c14, 0x14041c24, 0x1414143c, 0x142c040c, 0x14043c24, 0x141c2c34, 0x1434240c, +//0x140c141c, 0x141c0c34, 0x14340404, 0x140c341c, 0x1424242c, 0x143c1c04, 0x14140c14, 0x1424042c, +//0x1c043404, 0x1c142414, 0x1c2c1c24, 0x1c040c3c, 0x1c1c040c, 0x1c2c3424, 0x1c042c34, 0x1c1c1c0c, +//0x1c34141c, 0x1c0c0434, 0x1c243c04, 0x1c342c1c, 0x1c0c242c, 0x1c241404, 0x243c0c14, 0x2414042c, +//0x242c3404, 0x24042414, 0x24141c24, 0x242c0c3c, 0x2404040c, 0x241c3424, 0x24342434, 0x24041c0c, +//0x241c0c1c, 0x24340434, 0x240c3404, 0x2c242c1c, 0x2c3c1c2c, 0x2c141404, 0x2c240414, 0x2c043c2c, +//0x2c142c04, 0x2c2c2414, 0x2c041424, 0x2c1c0c3c, 0x2c2c040c, 0x2c043424, 0x2c1c2434, 0x2c341c0c, +//0x2c0c0c1c, 0x34240434, 0x34343404, 0x340c2c1c, 0x34241c2c, 0x343c1404, 0x34140414, 0x342c342c, +//0x34042c04, 0x34141c14, 0x342c1424, 0x3404043c, 0x341c3c0c, 0x34342c24, 0x3c042434, 0x3c1c140c, +//0x3c340c1c, 0x3c0c0434, 0x3c243404, 0x3c3c241c, 0x3c0c1c2c, 0x04240c04, 0x04040414, 0x0414342c, +//0x042c2c04, 0x04041c14, 0x041c1424, 0x042c043c, 0x04043c0c, 0x041c2c24, 0x04341c34, 0x040c140c, +//0x0424041c, 0x04343c34, 0x040c2c04, 0x0424241c, 0x043c142c, 0x04140c04, 0x042c0414, 0x0404342c, +//0x04142404, 0x042c1c14, 0x0c040c24, 0x0c1c043c, 0x0c34340c, 0x0c042c24, 0x0c1c1c34, 0x0c34140c, +//0x0c0c041c, 0x0c243c34, 0x0c3c2c04, 0x0c0c241c, 0x0c24142c, 0x0c040404, 0x0c143c14, 0x142c2c2c, +//0x14042404, 0x14141414, 0x142c0c24, 0x1404043c, 0x141c340c, 0x14342424, 0x140c1c34, 0x14240c0c, +//0x1434041c, 0x140c3434, 0x14242c04, 0x143c1c1c, 0x1414142c, 0x1c2c0404, 0x1c043c14, 0x1c142c2c, +//0x1c2c2404, 0x1c041414, 0x1c1c0c24, 0x1c343c3c, 0x1c042c0c, 0x1c1c2424, 0x1c341434, 0x1c0c0c0c, +//0x1c24041c, 0x1c3c3434, 0x240c2404, 0x24241c1c, 0x24040c2c, 0x24140404, 0x242c3414, 0x24042c2c, +//0x24141c04, 0x242c1414, 0x24040424, 0x241c3c3c, 0x24342c0c, 0x240c2424, 0x241c1434, 0x24340c0c, +//0x2c0c041c, 0x2c243434, 0x2c3c2404, 0x2c14141c, 0x2c2c0c2c, 0x2c040404, 0x2c143414, 0x2c2c242c, +//0x2c041c04, 0x2c1c0c14, 0x2c340424, 0x2c04343c, 0x2c1c2c0c, 0x2c341c24, 0x340c1434, 0x3424040c, +//0x343c3c1c, 0x340c2c34, 0x34242404, 0x3404141c, 0x34140c2c, 0x342c0404, 0x34043414, 0x3414242c, +//0x342c1c04, 0x34040c14, 0x341c0424, 0x3c34343c, 0x3c0c240c, 0x3c1c1c24, 0x3c340c34, 0x3c0c040c, +//0x3c24341c, 0x3c3c2c34, 0x04141c04, 0x0424141c, 0x0404042c, 0x04143c04, 0x042c2c14, 0x0404242c, +//0x041c1404, 0x04340c14, 0x04040424, 0x041c343c, 0x0434240c, 0x040c1c24, 0x04240c34, 0x043c040c, +//0x040c341c, 0x04242434, 0x04041c04, 0x04140c1c, 0x042c042c, 0x04043404, 0x0c142c14, 0x0c2c1c2c, +//0x0c041404, 0x0c1c0414, 0x0c343c24, 0x0c0c2c3c, 0x0c1c240c, 0x0c341424, 0x0c0c0c34, 0x0c24040c, +//0x0c3c341c, 0x0c142434, 0x0c241c04, 0x0c040c1c, 0x1414042c, 0x142c3404, 0x14042c14, 0x141c1c2c, +//0x142c1404, 0x14040414, 0x141c3424, 0x14342c3c, 0x140c1c0c, 0x14241424, 0x143c0434, 0x140c3c0c, +//0x14242c1c, 0x1c042434, 0x1c141404, 0x1c2c0c1c, 0x1c04042c, 0x1c143404, 0x1c2c2414, 0x1c041c2c, +//0x1c1c0c04, 0x1c340414, 0x1c0c3424, 0x1c1c2c3c, 0x1c341c0c, 0x1c0c1424, 0x1c240434, 0x243c3c0c, +//0x24142c1c, 0x24241c34, 0x24041404, 0x2414041c, 0x242c3c2c, 0x24042c04, 0x241c2414, 0x242c142c, +//0x24040c04, 0x241c0414, 0x24343424, 0x240c243c, 0x24241c0c, 0x2c340c24, 0x2c0c0434, 0x2c24340c, +//0x2c3c2c1c, 0x2c141c34, 0x2c2c1404, 0x2c04041c, 0x2c143c2c, 0x2c2c2c04, 0x2c042414, 0x2c1c142c, +//0x2c340404, 0x2c0c3c14, 0x341c2c24, 0x3434243c, 0x340c140c, 0x34240c24, 0x343c0434, 0x3414340c, +//0x3424241c, 0x34041c34, 0x34140c04, 0x342c041c, 0x3404342c, 0x341c2c04, 0x342c1c14, 0x3404142c, +//0x3c1c0404, 0x3c343c14, 0x3c0c2c24, 0x3c24243c, 0x3c34140c, 0x3c0c0c24, 0x3c243c34, 0x043c2c0c, +//0x0414241c, 0x042c1434, 0x04040c04, 0x0414041c, 0x042c342c, 0x04042404, 0x041c1c14, 0x04340c2c, +//0x040c0404, 0x041c3414, 0x04342c24, 0x040c1c3c, 0x0424140c, 0x043c0424, 0x04143c34, 0x04242c0c, +//0x0404241c, 0x04141434, 0x042c0c04, 0x0c04041c, 0x0c1c342c, 0x0c2c2404, 0x0c041414, 0x0c1c0c2c, +//0x0c340404, 0x0c0c3414, 0x0c242424, 0x0c341c3c, 0x0c0c0c0c, 0x0c240424, 0x0c3c3434, 0x0c142c0c, +//0x0c2c1c1c, 0x14041434, 0x14140404, 0x142c3c1c, 0x14042c2c, 0x141c2404, 0x14341414, 0x14040c2c, +//0x141c0404, 0x14343414, 0x140c2424, 0x14241c3c, 0x143c0c0c, 0x14140424, 0x1c243434, 0x1c04240c, +//0x1c141c1c, 0x1c2c0c34, 0x1c040404, 0x1c1c341c, 0x1c2c2c2c, 0x1c041c04, 0x1c1c1414, 0x1c34042c, +//0x1c0c3c04, 0x1c242c14, 0x1c342424, 0x1c0c143c, 0x24240c0c, 0x243c0424, 0x24143434, 0x242c240c, +//0x24041c1c, 0x24140c34, 0x242c0404, 0x2404341c, 0x241c242c, 0x24341c04, 0x24040c14, 0x241c042c, +//0x24343404, 0x2c0c2c14, 0x2c241c24, 0x2c3c143c, 0x2c0c040c, 0x2c243c24, 0x2c042c34, 0x2c14240c, +//0x2c2c141c, 0x2c040c34, 0x2c1c0404, 0x2c2c341c, 0x2c04242c, 0x2c1c1c04, 0x2c340c14, 0x340c042c, +//0x34243404, 0x34342c14, 0x340c1c24, 0x34240c3c, 0x343c040c, 0x34143424, 0x342c2c34, 0x34041c0c, +//0x3414141c, 0x342c0434, 0x34043c04, 0x341c2c1c, 0x3434242c, 0x3c041404, 0x3c1c0c14, 0x3c34042c, +//0x3c0c3404, 0x3c242414, 0x3c3c1c24, 0x040c0c3c, 0x0424040c, 0x04043424, 0x04142c34, 0x042c1c0c, +//0x0404141c, 0x04140434, 0x042c3c04, 0x04042c1c, 0x041c1c2c, 0x04341404, 0x040c0414, 0x041c3c2c, +//0x04342c04, 0x040c2414, 0x04241424, 0x043c0c3c, 0x0414040c, 0x042c3424, 0x04042434, 0x04141c0c, +//0x0c2c0c1c, 0x0c040434, 0x0c1c3404, 0x0c342c1c, 0x0c041c2c, 0x0c1c1404, 0x0c340414, 0x0c0c3c2c, +//0x0c242c04, 0x0c3c2414, 0x0c0c1424, 0x0c24043c, 0x0c043c0c, 0x14142c24, 0x142c2434, 0x1404140c, +//0x14140c1c, 0x142c0434, 0x14043404, 0x141c241c, 0x14341c2c, 0x140c0c04, 0x141c0414, 0x1434342c, +//0x140c2c04, 0x14241c14, 0x143c1424, 0x1c14043c, 0x1c243c0c, 0x1c042c24, 0x1c142434, 0x1c2c140c, +//0x1c040c1c, 0x1c1c3c34, 0x1c342c04, 0x1c04241c, 0x1c1c142c, 0x1c340c04, 0x1c0c0414, 0x1c24342c, +//0x1c3c2404, 0x240c1c14, 0x24240c24, 0x2404043c, 0x2414340c, 0x242c2c24, 0x24041c34, 0x2414140c, +//0x242c041c, 0x24043c34, 0x241c2c04, 0x2434241c, 0x240c142c, 0x241c0c04, 0x2c340414, 0x2c0c342c, +//}; static const __device__ uint64_t iq1s_grid[512] = { @@ -2370,6 +2370,12 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds } +//#define IQ3S_MULTIPLIER 2469109 +//#define IQ3S_MULTIPLIER 746226 +//#define IQ3S_MULTIPLIER 717154 +#define IQ3S_MULTIPLIER 677595 +//static const __device__ uint8_t iq3s_values[16] = {1, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15}; + template static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { @@ -2382,14 +2388,25 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_ const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; const uint8_t * qs = x[i].qs + 8*ib; - const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); - const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f; + int32_t aux32[2]; + const uint8_t * grid = (const uint8_t *)aux32; + const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)); const uint8_t signs = x[i].signs[4*ib + il]; - for (int j = 0; j < 4; ++j) { - y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); - y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + aux32[0] = ((qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f; + aux32[1] = ((qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f; + for (int j = 0; j < 8; ++j) { + //y[j] = d * (2*((grid[j]-1)/2) + 1) * (signs & kmask_iq2xs[j] ? -1.f : 1.f); + //y[j] = d * iq3s_values[grid[j]] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); + y[j] = d * (2*(((grid[j]+1)/2) & 7) + 1) * (signs & kmask_iq2xs[j] ? -1.f : 1.f); } +// const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); +// const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); +// const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f; +// const uint8_t signs = x[i].signs[4*ib + il]; +// for (int j = 0; j < 4; ++j) { +// y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); +// y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); +// } #else assert(false); #endif @@ -5196,22 +5213,36 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( #if QK_K == 256 const block_iq3_s * bq2 = (const block_iq3_s *) vbq; + uint32_t aux32[2]; + const uint8_t * grid = (const uint8_t *)aux32; + const int ib32 = iqs; const uint8_t * qs = bq2->qs + 8*ib32; const int8_t * q8 = bq8_1[ib32].qs; int sumi = 0; for (int l = 0; l < 4; ++l) { - const uint32_t * grid1 = iq3xs_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); - const uint32_t * grid2 = iq3xs_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); + aux32[0] = ((qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f; + aux32[1] = ((qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f; + aux32[0] = __vadd4(((__vadd4(aux32[0], 0x01010101) >> 1) & 0x07070707) << 1, 0x01010101); + aux32[1] = __vadd4(((__vadd4(aux32[1], 0x01010101) >> 1) & 0x07070707) << 1, 0x01010101); uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); - const int grid_l = __vsub4(grid1[0] ^ signs0, signs0); - const int grid_h = __vsub4(grid2[0] ^ signs1, signs1); + const int grid_l = __vsub4(aux32[0] ^ signs0, signs0); + const int grid_h = __vsub4(aux32[1] ^ signs1, signs1); sumi = __dp4a(grid_l, *((int *)q8+0), sumi); sumi = __dp4a(grid_h, *((int *)q8+1), sumi); + //const uint32_t * grid1 = iq3xs_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); + //const uint32_t * grid2 = iq3xs_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); + //uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); + //uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); + //const int grid_l = __vsub4(grid1[0] ^ signs0, signs0); + //const int grid_h = __vsub4(grid2[0] ^ signs1, signs1); + //sumi = __dp4a(grid_l, *((int *)q8+0), sumi); + //sumi = __dp4a(grid_h, *((int *)q8+1), sumi); q8 += 8; } - const float d = (float)bq2->d * (0.5f + ((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds) * 0.5f; + //const float d = (float)bq2->d * (0.5f + ((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds) * 0.5f; + const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds); return d * sumi; #else assert(false); diff --git a/ggml-quants.c b/ggml-quants.c index f73d17ce2..9f9d299fe 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -3790,70 +3790,70 @@ static const uint32_t iq3xxs_grid[256] = { }; static const uint32_t iq3xs_grid[512] = { - 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14, - 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414, - 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24, - 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c, - 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c, - 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34, - 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c, - 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414, - 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c, - 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404, - 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434, - 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c, - 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404, - 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414, - 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414, - 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404, - 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c, - 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c, - 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404, - 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e, - 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14, - 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c, - 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424, - 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c, - 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c, - 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e, - 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e, - 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e, - 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424, - 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e, - 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424, - 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404, - 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c, - 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e, - 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c, - 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c, - 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c, - 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404, - 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04, - 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c, - 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414, - 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c, - 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c, - 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424, - 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c, - 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c, - 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414, - 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c, - 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e, - 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04, - 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424, - 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14, - 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34, - 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c, - 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434, - 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c, - 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424, - 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24, - 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24, - 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e, - 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c, - 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c, - 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c, - 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404, +0x04040404, 0x04142c14, 0x042c2424, 0x0404143c, 0x04140c0c, 0x042c0424, 0x04043434, 0x041c240c, +0x04341c1c, 0x040c0c34, 0x041c0404, 0x0434341c, 0x040c2c2c, 0x04241c04, 0x043c1414, 0x0414042c, +0x04243c04, 0x04042c14, 0x04142424, 0x042c143c, 0x04040c0c, 0x0c1c0424, 0x0c2c3434, 0x0c04240c, +0x0c1c141c, 0x0c340c34, 0x0c0c0404, 0x0c24341c, 0x0c34242c, 0x0c0c1c04, 0x0c240c14, 0x0c3c042c, +0x0c143404, 0x0c2c2c14, 0x14041c24, 0x1414143c, 0x142c040c, 0x14043c24, 0x141c2c34, 0x1434240c, +0x140c141c, 0x141c0c34, 0x14340404, 0x140c341c, 0x1424242c, 0x143c1c04, 0x14140c14, 0x1424042c, +0x1c043404, 0x1c142414, 0x1c2c1c24, 0x1c040c3c, 0x1c1c040c, 0x1c2c3424, 0x1c042c34, 0x1c1c1c0c, +0x1c34141c, 0x1c0c0434, 0x1c243c04, 0x1c342c1c, 0x1c0c242c, 0x1c241404, 0x243c0c14, 0x2414042c, +0x242c3404, 0x24042414, 0x24141c24, 0x242c0c3c, 0x2404040c, 0x241c3424, 0x24342434, 0x24041c0c, +0x241c0c1c, 0x24340434, 0x240c3404, 0x2c242c1c, 0x2c3c1c2c, 0x2c141404, 0x2c240414, 0x2c043c2c, +0x2c142c04, 0x2c2c2414, 0x2c041424, 0x2c1c0c3c, 0x2c2c040c, 0x2c043424, 0x2c1c2434, 0x2c341c0c, +0x2c0c0c1c, 0x34240434, 0x34343404, 0x340c2c1c, 0x34241c2c, 0x343c1404, 0x34140414, 0x342c342c, +0x34042c04, 0x34141c14, 0x342c1424, 0x3404043c, 0x341c3c0c, 0x34342c24, 0x3c042434, 0x3c1c140c, +0x3c340c1c, 0x3c0c0434, 0x3c243404, 0x3c3c241c, 0x3c0c1c2c, 0x04240c04, 0x04040414, 0x0414342c, +0x042c2c04, 0x04041c14, 0x041c1424, 0x042c043c, 0x04043c0c, 0x041c2c24, 0x04341c34, 0x040c140c, +0x0424041c, 0x04343c34, 0x040c2c04, 0x0424241c, 0x043c142c, 0x04140c04, 0x042c0414, 0x0404342c, +0x04142404, 0x042c1c14, 0x0c040c24, 0x0c1c043c, 0x0c34340c, 0x0c042c24, 0x0c1c1c34, 0x0c34140c, +0x0c0c041c, 0x0c243c34, 0x0c3c2c04, 0x0c0c241c, 0x0c24142c, 0x0c040404, 0x0c143c14, 0x142c2c2c, +0x14042404, 0x14141414, 0x142c0c24, 0x1404043c, 0x141c340c, 0x14342424, 0x140c1c34, 0x14240c0c, +0x1434041c, 0x140c3434, 0x14242c04, 0x143c1c1c, 0x1414142c, 0x1c2c0404, 0x1c043c14, 0x1c142c2c, +0x1c2c2404, 0x1c041414, 0x1c1c0c24, 0x1c343c3c, 0x1c042c0c, 0x1c1c2424, 0x1c341434, 0x1c0c0c0c, +0x1c24041c, 0x1c3c3434, 0x240c2404, 0x24241c1c, 0x24040c2c, 0x24140404, 0x242c3414, 0x24042c2c, +0x24141c04, 0x242c1414, 0x24040424, 0x241c3c3c, 0x24342c0c, 0x240c2424, 0x241c1434, 0x24340c0c, +0x2c0c041c, 0x2c243434, 0x2c3c2404, 0x2c14141c, 0x2c2c0c2c, 0x2c040404, 0x2c143414, 0x2c2c242c, +0x2c041c04, 0x2c1c0c14, 0x2c340424, 0x2c04343c, 0x2c1c2c0c, 0x2c341c24, 0x340c1434, 0x3424040c, +0x343c3c1c, 0x340c2c34, 0x34242404, 0x3404141c, 0x34140c2c, 0x342c0404, 0x34043414, 0x3414242c, +0x342c1c04, 0x34040c14, 0x341c0424, 0x3c34343c, 0x3c0c240c, 0x3c1c1c24, 0x3c340c34, 0x3c0c040c, +0x3c24341c, 0x3c3c2c34, 0x04141c04, 0x0424141c, 0x0404042c, 0x04143c04, 0x042c2c14, 0x0404242c, +0x041c1404, 0x04340c14, 0x04040424, 0x041c343c, 0x0434240c, 0x040c1c24, 0x04240c34, 0x043c040c, +0x040c341c, 0x04242434, 0x04041c04, 0x04140c1c, 0x042c042c, 0x04043404, 0x0c142c14, 0x0c2c1c2c, +0x0c041404, 0x0c1c0414, 0x0c343c24, 0x0c0c2c3c, 0x0c1c240c, 0x0c341424, 0x0c0c0c34, 0x0c24040c, +0x0c3c341c, 0x0c142434, 0x0c241c04, 0x0c040c1c, 0x1414042c, 0x142c3404, 0x14042c14, 0x141c1c2c, +0x142c1404, 0x14040414, 0x141c3424, 0x14342c3c, 0x140c1c0c, 0x14241424, 0x143c0434, 0x140c3c0c, +0x14242c1c, 0x1c042434, 0x1c141404, 0x1c2c0c1c, 0x1c04042c, 0x1c143404, 0x1c2c2414, 0x1c041c2c, +0x1c1c0c04, 0x1c340414, 0x1c0c3424, 0x1c1c2c3c, 0x1c341c0c, 0x1c0c1424, 0x1c240434, 0x243c3c0c, +0x24142c1c, 0x24241c34, 0x24041404, 0x2414041c, 0x242c3c2c, 0x24042c04, 0x241c2414, 0x242c142c, +0x24040c04, 0x241c0414, 0x24343424, 0x240c243c, 0x24241c0c, 0x2c340c24, 0x2c0c0434, 0x2c24340c, +0x2c3c2c1c, 0x2c141c34, 0x2c2c1404, 0x2c04041c, 0x2c143c2c, 0x2c2c2c04, 0x2c042414, 0x2c1c142c, +0x2c340404, 0x2c0c3c14, 0x341c2c24, 0x3434243c, 0x340c140c, 0x34240c24, 0x343c0434, 0x3414340c, +0x3424241c, 0x34041c34, 0x34140c04, 0x342c041c, 0x3404342c, 0x341c2c04, 0x342c1c14, 0x3404142c, +0x3c1c0404, 0x3c343c14, 0x3c0c2c24, 0x3c24243c, 0x3c34140c, 0x3c0c0c24, 0x3c243c34, 0x043c2c0c, +0x0414241c, 0x042c1434, 0x04040c04, 0x0414041c, 0x042c342c, 0x04042404, 0x041c1c14, 0x04340c2c, +0x040c0404, 0x041c3414, 0x04342c24, 0x040c1c3c, 0x0424140c, 0x043c0424, 0x04143c34, 0x04242c0c, +0x0404241c, 0x04141434, 0x042c0c04, 0x0c04041c, 0x0c1c342c, 0x0c2c2404, 0x0c041414, 0x0c1c0c2c, +0x0c340404, 0x0c0c3414, 0x0c242424, 0x0c341c3c, 0x0c0c0c0c, 0x0c240424, 0x0c3c3434, 0x0c142c0c, +0x0c2c1c1c, 0x14041434, 0x14140404, 0x142c3c1c, 0x14042c2c, 0x141c2404, 0x14341414, 0x14040c2c, +0x141c0404, 0x14343414, 0x140c2424, 0x14241c3c, 0x143c0c0c, 0x14140424, 0x1c243434, 0x1c04240c, +0x1c141c1c, 0x1c2c0c34, 0x1c040404, 0x1c1c341c, 0x1c2c2c2c, 0x1c041c04, 0x1c1c1414, 0x1c34042c, +0x1c0c3c04, 0x1c242c14, 0x1c342424, 0x1c0c143c, 0x24240c0c, 0x243c0424, 0x24143434, 0x242c240c, +0x24041c1c, 0x24140c34, 0x242c0404, 0x2404341c, 0x241c242c, 0x24341c04, 0x24040c14, 0x241c042c, +0x24343404, 0x2c0c2c14, 0x2c241c24, 0x2c3c143c, 0x2c0c040c, 0x2c243c24, 0x2c042c34, 0x2c14240c, +0x2c2c141c, 0x2c040c34, 0x2c1c0404, 0x2c2c341c, 0x2c04242c, 0x2c1c1c04, 0x2c340c14, 0x340c042c, +0x34243404, 0x34342c14, 0x340c1c24, 0x34240c3c, 0x343c040c, 0x34143424, 0x342c2c34, 0x34041c0c, +0x3414141c, 0x342c0434, 0x34043c04, 0x341c2c1c, 0x3434242c, 0x3c041404, 0x3c1c0c14, 0x3c34042c, +0x3c0c3404, 0x3c242414, 0x3c3c1c24, 0x040c0c3c, 0x0424040c, 0x04043424, 0x04142c34, 0x042c1c0c, +0x0404141c, 0x04140434, 0x042c3c04, 0x04042c1c, 0x041c1c2c, 0x04341404, 0x040c0414, 0x041c3c2c, +0x04342c04, 0x040c2414, 0x04241424, 0x043c0c3c, 0x0414040c, 0x042c3424, 0x04042434, 0x04141c0c, +0x0c2c0c1c, 0x0c040434, 0x0c1c3404, 0x0c342c1c, 0x0c041c2c, 0x0c1c1404, 0x0c340414, 0x0c0c3c2c, +0x0c242c04, 0x0c3c2414, 0x0c0c1424, 0x0c24043c, 0x0c043c0c, 0x14142c24, 0x142c2434, 0x1404140c, +0x14140c1c, 0x142c0434, 0x14043404, 0x141c241c, 0x14341c2c, 0x140c0c04, 0x141c0414, 0x1434342c, +0x140c2c04, 0x14241c14, 0x143c1424, 0x1c14043c, 0x1c243c0c, 0x1c042c24, 0x1c142434, 0x1c2c140c, +0x1c040c1c, 0x1c1c3c34, 0x1c342c04, 0x1c04241c, 0x1c1c142c, 0x1c340c04, 0x1c0c0414, 0x1c24342c, +0x1c3c2404, 0x240c1c14, 0x24240c24, 0x2404043c, 0x2414340c, 0x242c2c24, 0x24041c34, 0x2414140c, +0x242c041c, 0x24043c34, 0x241c2c04, 0x2434241c, 0x240c142c, 0x241c0c04, 0x2c340414, 0x2c0c342c, }; #define NGRID_IQ2XXS 512 @@ -4121,10 +4121,19 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y // ====================== 3.3125 bpw (de)-quantization +//#define IQ3S_MULTIPLIER 2469109 +//#define IQ3S_MULTIPLIER 746226 +//#define IQ3S_MULTIPLIER 717154 +#define IQ3S_MULTIPLIER 677595 +#define IQ3S_BITS 3 + void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; + uint32_t aux32[2]; + const int8_t * grid = (const int8_t *)aux32; + for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); @@ -4133,25 +4142,25 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in const uint8_t * signs = x[i].signs; for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const float db1 = d * (0.5f + (x[i].scales[ib32/2] & 0xf)) * 0.5f; - const float db2 = d * (0.5f + (x[i].scales[ib32/2] >> 4)) * 0.5f; + const float db1 = d * (1 + 2*(x[i].scales[ib32/2] & 0xf)); + const float db2 = d * (1 + 2*(x[i].scales[ib32/2] >> 4)); for (int l = 0; l < 4; ++l) { - const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256))); - for (int j = 0; j < 4; ++j) { - y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f); - y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f); + aux32[0] = ((qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f; + aux32[1] = ((qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f; + for (int j = 0; j < 8; ++j) { + //y[j] = db1 * (2*((grid[j]-1)/2) + 1) * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f); + y[j] = db1 * (2*(((grid[j]+1)/2) & 7) + 1) * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f); } y += 8; } qs += 8; signs += 4; for (int l = 0; l < 4; ++l) { - const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256))); - for (int j = 0; j < 4; ++j) { - y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f); - y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f); + aux32[0] = ((qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f; + aux32[1] = ((qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f; + for (int j = 0; j < 8; ++j) { + //y[j] = db2 * (2*((grid[j]-1)/2) + 1) * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f); + y[j] = db2 * (2*(((grid[j]+1)/2) & 7) + 1) * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f); } y += 8; } @@ -4159,6 +4168,34 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in qs += 8; signs += 4; } + + //for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + // const float db1 = d * (0.5f + (x[i].scales[ib32/2] & 0xf)) * 0.5f; + // const float db2 = d * (0.5f + (x[i].scales[ib32/2] >> 4)) * 0.5f; + // for (int l = 0; l < 4; ++l) { + // const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256))); + // const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256))); + // for (int j = 0; j < 4; ++j) { + // y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f); + // y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f); + // } + // y += 8; + // } + // qs += 8; + // signs += 4; + // for (int l = 0; l < 4; ++l) { + // const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256))); + // const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256))); + // for (int j = 0; j < 4; ++j) { + // y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f); + // y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f); + // } + // y += 8; + // } + // qh += 2; + // qs += 8; + // signs += 4; + //} } } @@ -11311,6 +11348,112 @@ static int iq3_compare_func(const void * left, const void * right) { return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0; } +static void iq3xs_init_grid512(void) { + + const int kmap_size = 1 << IQ3S_BITS*4; + const int grid_size = 512; + const int nwant = 3; + const int gindex = iq3_data_index(512); + const uint8_t kmask = (1 << IQ3S_BITS) - 1; + + uint32_t * kgrid_q3xs; + int * kmap_q3xs; + uint16_t * kneighbors_q3xs; + + printf("================================================================= %s(grid_size = %d, map_size = %d)\n", + __func__, grid_size, kmap_size); + uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t)); + uint32_t aux32; + const uint8_t * q4 = (const uint8_t *)&aux32; + for (int k = 0; k < grid_size; ++k) { + int8_t * pos = (int8_t *)(the_grid + k); + aux32 = (IQ3S_MULTIPLIER * k) & 0x0f0f0f0f; + for (int i = 0; i < 4; ++i) { + //pos[i] = 2*((q4[i]-1)/2) + 1; + pos[i] = 2*(((q4[i]+1)/2) & kmask) + 1; + } + } + + kgrid_q3xs = the_grid; + iq3_data[gindex].grid = the_grid; + kmap_q3xs = (int *)malloc(kmap_size*sizeof(int)); + iq3_data[gindex].map = kmap_q3xs; + for (int i = 0; i < kmap_size; ++i) kmap_q3xs[i] = -1; + for (int i = 0; i < grid_size; ++i) { + aux32 = kgrid_q3xs[i]; + uint16_t index = 0; + for (int k=0; k<4; ++k) { + uint16_t q = (q4[k] - 1)/2; + index |= (q << IQ3S_BITS*k); + } + kmap_q3xs[index] = i; + } + int8_t pos[4]; + int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + int num_neighbors = 0, num_not_in_map = 0; + for (int i = 0; i < kmap_size; ++i) { + if (kmap_q3xs[i] >= 0) continue; + ++num_not_in_map; + for (int k = 0; k < 4; ++k) { + int l = (i >> IQ3S_BITS*k) & kmask; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q3xs + j); + int d2 = 0; + for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func); + int n = 0; int d2 = dist2[0]; + int nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + ++n; + } + num_neighbors += n; + } + printf("%s: %d neighbours in total\n", __func__, num_neighbors); + kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t)); + iq3_data[gindex].neighbours = kneighbors_q3xs; + int counter = 0; + for (int i = 0; i < kmap_size; ++i) { + if (kmap_q3xs[i] >= 0) continue; + for (int k = 0; k < 4; ++k) { + int l = (i >> IQ3S_BITS*k) & kmask; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q3xs + j); + int d2 = 0; + for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func); + kmap_q3xs[i] = -(counter + 1); + int d2 = dist2[0]; + uint16_t * start = &kneighbors_q3xs[counter++]; + int n = 0, nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + kneighbors_q3xs[counter++] = dist2[2*j+1]; + ++n; + } + *start = n; + } + free(dist2); +} + void iq3xs_init_impl(int grid_size) { const int gindex = iq3_data_index(grid_size); if (iq3_data[gindex].grid) { @@ -11334,44 +11477,49 @@ void iq3xs_init_impl(int grid_size) { 3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610, 3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992, }; - static const uint16_t kgrid_512[512] = { - 0, 1, 2, 5, 7, 8, 9, 10, 12, 14, 16, 17, 21, 27, 32, 34, - 37, 39, 41, 43, 48, 50, 57, 60, 63, 64, 65, 66, 68, 72, 73, 77, - 80, 83, 87, 89, 93, 100, 113, 117, 122, 128, 129, 133, 135, 136, 139, 142, - 145, 149, 152, 156, 162, 165, 167, 169, 171, 184, 187, 195, 201, 205, 208, 210, - 217, 219, 222, 228, 232, 234, 247, 249, 253, 256, 267, 271, 273, 276, 282, 288, - 291, 297, 312, 322, 324, 336, 338, 342, 347, 353, 357, 359, 374, 379, 390, 393, - 395, 409, 426, 441, 448, 450, 452, 464, 466, 470, 475, 488, 492, 512, 513, 514, - 516, 520, 521, 523, 525, 527, 528, 530, 537, 540, 542, 556, 558, 561, 570, 576, - 577, 579, 582, 584, 588, 593, 600, 603, 609, 616, 618, 632, 638, 640, 650, 653, - 655, 656, 660, 666, 672, 675, 685, 688, 698, 705, 708, 711, 712, 715, 721, 727, - 728, 732, 737, 754, 760, 771, 773, 778, 780, 793, 795, 802, 806, 808, 812, 833, - 840, 843, 849, 856, 858, 873, 912, 916, 919, 932, 934, 961, 963, 968, 970, 977, - 989, 993, 1010, 1016, 1024, 1025, 1027, 1029, 1031, 1032, 1034, 1036, 1038, 1041, 1043, 1047, - 1048, 1050, 1057, 1059, 1061, 1064, 1066, 1079, 1080, 1083, 1085, 1088, 1090, 1096, 1099, 1103, - 1106, 1109, 1113, 1116, 1122, 1129, 1153, 1156, 1159, 1169, 1171, 1176, 1183, 1185, 1195, 1199, - 1209, 1212, 1216, 1218, 1221, 1225, 1234, 1236, 1241, 1243, 1250, 1256, 1270, 1281, 1287, 1296, - 1299, 1306, 1309, 1313, 1338, 1341, 1348, 1353, 1362, 1375, 1376, 1387, 1400, 1408, 1410, 1415, - 1425, 1453, 1457, 1477, 1481, 1494, 1496, 1507, 1512, 1538, 1545, 1547, 1549, 1551, 1554, 1561, - 1563, 1565, 1570, 1572, 1575, 1577, 1587, 1593, 1601, 1603, 1605, 1612, 1617, 1619, 1632, 1648, - 1658, 1662, 1664, 1674, 1680, 1690, 1692, 1704, 1729, 1736, 1740, 1745, 1747, 1751, 1752, 1761, - 1763, 1767, 1773, 1787, 1795, 1801, 1806, 1810, 1817, 1834, 1840, 1844, 1857, 1864, 1866, 1877, - 1882, 1892, 1902, 1915, 1934, 1953, 1985, 1987, 2000, 2002, 2013, 2048, 2052, 2058, 2064, 2068, - 2071, 2074, 2081, 2088, 2104, 2114, 2119, 2121, 2123, 2130, 2136, 2141, 2147, 2153, 2157, 2177, - 2179, 2184, 2189, 2193, 2203, 2208, 2223, 2226, 2232, 2244, 2249, 2251, 2256, 2258, 2265, 2269, - 2304, 2306, 2324, 2335, 2336, 2361, 2373, 2375, 2385, 2418, 2443, 2460, 2480, 2504, 2509, 2520, - 2531, 2537, 2562, 2568, 2572, 2578, 2592, 2596, 2599, 2602, 2614, 2620, 2625, 2627, 2629, 2634, - 2641, 2650, 2682, 2688, 2697, 2707, 2712, 2718, 2731, 2754, 2759, 2760, 2775, 2788, 2793, 2805, - 2811, 2817, 2820, 2832, 2842, 2854, 2890, 2902, 2921, 2923, 2978, 3010, 3012, 3026, 3081, 3083, - 3085, 3097, 3099, 3120, 3136, 3152, 3159, 3188, 3210, 3228, 3234, 3245, 3250, 3256, 3264, 3276, - 3281, 3296, 3349, 3363, 3378, 3392, 3395, 3420, 3440, 3461, 3488, 3529, 3531, 3584, 3588, 3591, - 3600, 3602, 3614, 3616, 3628, 3634, 3650, 3657, 3668, 3683, 3685, 3713, 3716, 3720, 3726, 3729, - 3736, 3753, 3778, 3802, 3805, 3819, 3841, 3845, 3851, 3856, 3880, 3922, 3938, 3970, 3993, 4032, - }; + if (grid_size == 512) { + iq3xs_init_grid512(); + return; + } +// static const uint16_t kgrid_512[512] = { +// 0, 170, 356, 23, 137, 324, 54, 225, 411, 78, 192, 435, 109, 280, 466, 133, +// 312, 42, 164, 343, 9, 708, 886, 545, 723, 910, 576, 819, 933, 600, 778, 965, +// 688, 874, 1052, 1175, 1345, 1084, 1262, 1441, 1107, 1230, 1408, 1139, 1317, 1496, 1162, 1285, +//1584, 1698, 1884, 1551, 1729, 1908, 1582, 1753, 1939, 1606, 1848, 1963, 1637, 1808, 2506, 2181, +//2416, 2082, 2204, 2383, 2049, 2292, 2470, 2073, 2251, 2438, 2160, 2859, 3037, 2704, 2818, 2621, +//2728, 2914, 2580, 2767, 2881, 2612, 2790, 2969, 2635, 3334, 3504, 3179, 3357, 3536, 3202, 3445, +//3112, 3226, 3412, 3079, 3321, 3500, 3622, 3793, 3979, 3654, 3888, 4067, 3677, 264, 2, 181, +// 360, 26, 212, 327, 57, 236, 414, 81, 259, 446, 104, 291, 469, 136, 322, 53, +// 160, 346, 524, 711, 945, 556, 734, 913, 579, 830, 1000, 611, 789, 512, 698, 1389, +//1056, 1170, 1356, 1031, 1265, 1444, 1118, 1289, 1411, 1142, 1320, 1499, 1173, 1856, 1594, 1709, +//1888, 1554, 1740, 1983, 1577, 1764, 1942, 1609, 1795, 2038, 2144, 2331, 2061, 2176, 2418, 2093, +//2200, 2386, 2052, 2303, 2473, 2148, 2262, 2441, 2627, 2870, 3040, 2707, 2893, 2560, 2738, 2917, +//2584, 2762, 2948, 2615, 2793, 2972, 3158, 3329, 3579, 3182, 3360, 3091, 3213, 3392, 3122, 3237, +//3416, 3082, 3268, 4023, 3681, 3804, 3982, 3649, 3891, 4078, 152, 275, 5, 184, 362, 37, +// 208, 394, 4, 247, 417, 92, 270, 449, 115, 294, 24, 139, 325, 48, 682, 861, +// 528, 706, 956, 623, 737, 916, 590, 769, 1011, 678, 792, 523, 1157, 1392, 1066, 1245, +//1360, 1026, 1268, 1455, 1113, 1300, 1478, 1145, 1323, 1574, 1680, 1867, 1541, 1712, 1890, 1565, +//1736, 1922, 1652, 1775, 1945, 1620, 1798, 2553, 2219, 2334, 2064, 2179, 2429, 2088, 2274, 2389, +//2056, 2242, 2484, 2151, 2329, 2956, 2630, 2865, 3051, 2718, 2896, 2563, 2749, 2920, 2594, 2773, +//2944, 2682, 3308, 3495, 3153, 3340, 3526, 3249, 3363, 3102, 3208, 3395, 3125, 3304, 3418, 3093, +//3776, 4026, 3692, 3879, 3985, 3660, 3902, 489, 163, 342, 8, 131, 373, 32, 218, 397, +// 64, 242, 428, 95, 273, 452, 190, 297, 35, 150, 328, 515, 757, 864, 530, 717, +// 896, 626, 804, 927, 585, 772, 1014, 681, 859, 1046, 1152, 1403, 1069, 1248, 1426, 1037, +//1216, 1458, 1124, 1311, 1481, 1156, 1846, 1569, 1691, 1870, 1536, 1779, 1901, 1560, 1746, 1925, +//1656, 1834, 1956, 1623, 2313, 2500, 2230, 2401, 2075, 2190, 2368, 2099, 2277, 2456, 2058, 2245, +//2480, 2666, 2844, 3031, 2625, 2876, 2606, 2721, 2899, 2574, 2752, 2931, 2597, 2776, 2954, 3141, +//3376, 3498, 3164, 3343, 3521, 3252, 3438, 3097, 3219, 3398, 3128, 3307, 3493, 3600, 3786, 3973, +//3696, 3874, 4060, 79, 257, 52, 174, 345, 19, 134, 376, 43, 221, 400, 66, 253, +// 424, 98, 276, 463, 129, 372, 38, 153, 843, 518, 752, 939, 541, 720, 898, 637, +// 808, 994, 596, 775, 569, 1196, 1382, 1041, 1163, 1350, 1072, 1251, 1437, 1096, 1218, 1461, +//1128, 1306, 1492, 1671, 1849, 1580, 1702, 1873, 1547, 1790, 1960, 1571, 1749, 1928, 1602, 1845, +//2016, 2138, 2316, 2055, 2225, 2412, 2078, 2193, 2371, 2110, 2280, 2467, 2133, 2248, 2946, 2677, +// }; const int kmap_size = 4096; const int nwant = grid_size == 256 ? 2 : 3; - const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512; + //const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512; + const uint16_t * kgrid = kgrid_256; uint32_t * kgrid_q3xs; int * kmap_q3xs; uint16_t * kneighbors_q3xs; @@ -11773,6 +11921,8 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo for (int ibl = 0; ibl < nbl; ++ibl) { + //float block_mse = 0; + memset(&y[ibl], 0, sizeof(block_iq3_s)); y[ibl].d = GGML_FP32_TO_FP16(0.f); @@ -11823,7 +11973,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l)); } uint16_t u = 0; - for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i); + for (int i = 0; i < 4; ++i) u |= Laux[4*k+i] << IQ3S_BITS*i; int grid_index = kmap_q3xs[u]; is_on_grid_aux[k] = true; if (grid_index < 0) { @@ -11855,7 +12005,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo for (int i = 0; i < 4; ++i) { int l = nearest_int(0.5f*(id*xval[4*k+i]-1)); l = MAX(0, MIN(kMaxQ-1, l)); - u |= (l << 3*i); + u |= l << IQ3S_BITS*i; } int grid_index = kmap_q3xs[u]; if (grid_index < 0) { @@ -11882,7 +12032,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo } for (int k = 0; k < bs4; ++k) { uint16_t u = 0; - for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i); + for (int i = 0; i < 4; ++i) u |= L[4*k+i] << IQ3S_BITS*i; int grid_index = kmap_q3xs[u]; if (grid_index < 0) { printf("Oops: found point %u not on grid:", u); @@ -11899,14 +12049,25 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo GGML_ASSERT(scale >= 0); scales[ib] = scale; max_scale = MAX(max_scale, scale); + + //for (int k = 0; k < bs8; ++k) { + // for (int i = 0; i < 8; ++i) { + // float diff = scale*(2*L[8*k+i] + 1) * (block_signs[k] & (1 << i) ? -1 : 1) - xb[8*k+i]; + // block_mse += diff*diff; + // } + //} + } + //printf("Block %d: rmse = %g\n", ibl, (double)sqrtf(block_mse/QK_K)); + if (!max_scale) { continue; } float d = max_scale/31; - y[ibl].d = GGML_FP32_TO_FP16(d); + //y[ibl].d = GGML_FP32_TO_FP16(d * 1.025f); //1.02f); //1.0125f); + y[ibl].d = GGML_FP32_TO_FP16(d * 1.033f); //1.04f); //1.02f); //1.0125f); float id = 1/d; for (int ib = 0; ib < QK_K/block_size; ib += 2) { int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));