From 946796fcec1b6173ec02893d4f7839d8e70276ce Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sat, 11 Jan 2025 19:48:08 -0500 Subject: [PATCH] ggml-cuda : slight optimizations for TQ2_0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/convert.cu | 2 +- ggml/src/ggml-cuda/mmq.cuh | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index c19203da3..2f067ad1b 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -287,7 +287,7 @@ static __global__ void dequantize_block_tq2_0(const void * __restrict__ vx, dst_ const int64_t n = tid/32; // 0 or 1 const int64_t l = tid - 32*n; // 0..32 - const uint8_t q = x[i].qs[32*n + l]; + const uint8_t q = x[i].qs[tid]; dst_t * y = yy + i*QK_K + 128*n; float d = __half2float(x[i].d); diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 8d1370184..ab29cc927 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -1837,7 +1837,9 @@ template static __device__ __forceinlin const int qs0 = get_int_b2(bxi->qs, kqsx); #pragma unroll - for (int l = 0; l < QR2_0; ++l) { + for (int l0 = 0; l0 < QR2_0; ++l0) { + const int l = (l0 + kqsx/8) % QR2_0; // avoid shared memory bank conflicts + // 0..7, 32..39 // 8..15, 40..47 // 16..23, 48..55