From 946796fcec1b6173ec02893d4f7839d8e70276ce Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin <git@compilade.net>
Date: Sat, 11 Jan 2025 19:48:08 -0500
Subject: [PATCH] ggml-cuda : slight optimizations for TQ2_0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---
 ggml/src/ggml-cuda/convert.cu | 2 +-
 ggml/src/ggml-cuda/mmq.cuh    | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index c19203da3..2f067ad1b 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -287,7 +287,7 @@ static __global__ void dequantize_block_tq2_0(const void * __restrict__ vx, dst_
     const int64_t n   = tid/32;      // 0 or 1
     const int64_t l   = tid - 32*n;  // 0..32
 
-    const uint8_t q = x[i].qs[32*n + l];
+    const uint8_t q = x[i].qs[tid];
     dst_t * y = yy + i*QK_K + 128*n;
 
     float d = __half2float(x[i].d);
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 8d1370184..ab29cc927 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -1837,7 +1837,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         const int qs0 = get_int_b2(bxi->qs, kqsx);
 
 #pragma unroll
-        for (int l = 0; l < QR2_0; ++l) {
+        for (int l0 = 0; l0 < QR2_0; ++l0) {
+            const int l = (l0 + kqsx/8) % QR2_0; // avoid shared memory bank conflicts
+
             //  0..7,  32..39
             //  8..15, 40..47
             // 16..23, 48..55