From 6d15da1ec08542af9c71599a1d1bff2066786b47 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Date: Tue, 5 Mar 2024 08:36:57 +0200
Subject: [PATCH] iq3_s_mult_shuffle: use new multiplier and cleanup

---
 ggml-cuda.cu     | 49 +-----------------------------------------------
 ggml-metal.metal |  2 +-
 ggml-quants.c    |  3 +--
 3 files changed, 3 insertions(+), 51 deletions(-)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 6f8c4a3ac..373f03a23 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2371,17 +2371,8 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
 
 }
 
-#ifdef IQ3S_SLOW_MULT
-// Better (lower PPL), but requires more bit twidling, so slower
-#define IQ3S_MULTIPLIER 190842953LL
-#else
-#define IQ3S_MULTIPLIER 72968561ULL
-//#define IQ3S_MULTIPLIER 540201
-//#define IQ3S_MULTIPLIER 1378231
-//#define IQ3S_MULTIPLIER 898886
-//#define IQ3S_MULTIPLIER 842866
+#define IQ3S_MULTIPLIER 518559
 static const __device__ uint8_t iq3s_values[16] = {1, 1, 1, 3, 3, 3, 5, 5, 5, 7, 7, 9, 9, 11, 13, 15};
-#endif
 
 template<typename dst_t>
 static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
@@ -2400,40 +2391,11 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
     const int is = (32*ib + 8*il)/IQ3S_BLOCK_SIZE;
     const float d = (float)x[i].d * (1 + 2*((x[i].scales[is/2] >> 4*(is%2)) & 0xf));
     const uint8_t signs = x[i].signs[4*ib + il];
-#ifdef IQ3S_SLOW_MULT
-    aux32[0] = ((qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
-    aux32[1] = ((qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
-#else
-    //aux32[0] = (((qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f) | 0x01010101;
-    //aux32[1] = (((qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f) | 0x01010101;
     aux32[0] = (((qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f);
     aux32[1] = (((qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f);
-#endif
-//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-//#ifdef IQ3S_SLOW_MULT
-//    aux32[0] = (((__vmaxs4(__vsub4(aux32[0], 0x01010101), 0x00000000) >> 1) & 0x07070707) << 1) | 0x01010101;
-//    aux32[1] = (((__vmaxs4(__vsub4(aux32[1], 0x01010101), 0x00000000) >> 1) & 0x07070707) << 1) | 0x01010101;
-//#endif
-//    uint32_t signs0 = __vcmpeq4(((signs & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
-//    uint32_t signs1 = __vcmpeq4(((signs >>  4) * 0x01010101) & 0x08040201, 0x08040201);
-//    aux32[0] = __vsub4(aux32[0] ^ signs0, signs0);
-//    aux32[1] = __vsub4(aux32[1] ^ signs1, signs1);
-//    for (int j = 0; j < 8; ++j) {
-//        y[j] = d * grid[j];
-//    }
-//#else
-#ifdef IQ3S_SLOW_MULT
     for (int j = 0; j < 8; ++j) {
-        y[j] = d * (2*((grid[j]-1)/2) + 1) * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-    }
-#else
-    //static const uint8_t k_values[16] = {1, 1, 1, 3, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 15};
-    for (int j = 0; j < 8; ++j) {
-        //y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
         y[j] = d * iq3s_values[grid[j]] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
     }
-#endif
-//#endif
 #else
     assert(false);
 #endif
@@ -5251,18 +5213,9 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
     int sumi[2] = {0, 0};
 #endif
     for (int l = 0; l < 4; ++l) {
-#ifdef IQ3S_SLOW_MULT
-        aux32[0] = ((qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
-        aux32[1] = ((qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
-        aux32[0] = (((__vmaxs4(__vsub4(aux32[0], 0x01010101), 0) >> 1) & 0x07070707) << 1) | 0x01010101;
-        aux32[1] = (((__vmaxs4(__vsub4(aux32[1], 0x01010101), 0) >> 1) & 0x07070707) << 1) | 0x01010101;
-#else
-        //aux32[0] = (((qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f) | 0x01010101;
-        //aux32[1] = (((qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f) | 0x01010101;
         aux32[0] = (((qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f);
         aux32[1] = (((qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f);
         for (int j = 0; j < 8; ++j) aux8[j] = iq3s_values[aux8[j]];
-#endif
         uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
         uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
         const int grid_l = __vsub4(aux32[0] ^ signs0, signs0);
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 176287fcd..8c3ac9e34 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -2546,7 +2546,7 @@ typedef struct {
     uint8_t signs[QK_K/8];
     uint8_t scales[IQ3S_N_SCALE];
 } block_iq3_s;
-#define IQ3S_MULTIPLIER 72968561ULL
+#define IQ3S_MULTIPLIER 518559
 constexpr constant static uint8_t iq3s_values[16] = {1, 1, 1, 3, 3, 3, 5, 5, 5, 7, 7, 9, 9, 11, 13, 15};
 
 typedef struct {
diff --git a/ggml-quants.c b/ggml-quants.c
index ef21bb487..c83b7c775 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -4054,8 +4054,7 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
 
 // ====================== 3.3125 bpw (de)-quantization
 
-#define IQ3S_MULTIPLIER 72968561ULL
-
+#define IQ3S_MULTIPLIER 518559
 #define IQ3S_BITS 3
 
 static const uint8_t iq3s_values[16] = {1, 1, 1, 3, 3, 3, 5, 5, 5, 7, 7, 9, 9, 11, 13, 15};