mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-27 20:43:07 +01:00
ggml-cuda : remove some superfluous comments for TQ2_0 tile loading
This commit is contained in:
parent
983aa09b5c
commit
f5fddb6d24
@ -1848,13 +1848,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
|||||||
#ifdef INT8_MMA_AVAILABLE
|
#ifdef INT8_MMA_AVAILABLE
|
||||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k] = q;
|
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k] = q;
|
||||||
#else
|
#else
|
||||||
// NOTE: this might assume WARP_SIZE is >= 32
|
|
||||||
x_qs[i*(2*WARP_SIZE + 1) + k] = q;
|
x_qs[i*(2*WARP_SIZE + 1) + k] = q;
|
||||||
#endif // INT8_MMA_AVAILABLE
|
#endif // INT8_MMA_AVAILABLE
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: does this work with WARP_SIZE != 32?
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI2_0/2)) {
|
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI2_0/2)) {
|
||||||
int i = i0 + threadIdx.y*(2*WARP_SIZE/QI2_0) + threadIdx.x/(QI2_0/2);
|
int i = i0 + threadIdx.y*(2*WARP_SIZE/QI2_0) + threadIdx.x/(QI2_0/2);
|
||||||
|
Loading…
Reference in New Issue
Block a user