mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-02-04 23:52:32 +01:00
ggml-cuda : slight optimizations for TQ2_0
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
This commit is contained in:
parent
f5fddb6d24
commit
946796fcec
@ -287,7 +287,7 @@ static __global__ void dequantize_block_tq2_0(const void * __restrict__ vx, dst_
|
|||||||
const int64_t n = tid/32; // 0 or 1
|
const int64_t n = tid/32; // 0 or 1
|
||||||
const int64_t l = tid - 32*n; // 0..32
|
const int64_t l = tid - 32*n; // 0..32
|
||||||
|
|
||||||
const uint8_t q = x[i].qs[32*n + l];
|
const uint8_t q = x[i].qs[tid];
|
||||||
dst_t * y = yy + i*QK_K + 128*n;
|
dst_t * y = yy + i*QK_K + 128*n;
|
||||||
|
|
||||||
float d = __half2float(x[i].d);
|
float d = __half2float(x[i].d);
|
||||||
|
@ -1837,7 +1837,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
|||||||
const int qs0 = get_int_b2(bxi->qs, kqsx);
|
const int qs0 = get_int_b2(bxi->qs, kqsx);
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int l = 0; l < QR2_0; ++l) {
|
for (int l0 = 0; l0 < QR2_0; ++l0) {
|
||||||
|
const int l = (l0 + kqsx/8) % QR2_0; // avoid shared memory bank conflicts
|
||||||
|
|
||||||
// 0..7, 32..39
|
// 0..7, 32..39
|
||||||
// 8..15, 40..47
|
// 8..15, 40..47
|
||||||
// 16..23, 48..55
|
// 16..23, 48..55
|
||||||
|
Loading…
Reference in New Issue
Block a user