cuda : fix get_rows when ncols is odd

This commit is contained in:
slaren 2023-12-10 13:12:11 +01:00
parent cefebb3660
commit 8614aa736d

View File

@ -1721,6 +1721,32 @@ static __global__ void k_get_rows(
dst_row[iybs + iqs + y_offset] = v.y; dst_row[iybs + iqs + y_offset] = v.y;
} }
template<typename src0_t, typename dst_t>
static __global__ void k_get_rows_float(
const src0_t * src0, const int32_t * src1, dst_t * dst,
int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
/*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
/*size_t s0,*/ size_t s1, size_t s2, size_t s3,
/*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
if (i00 >= ne00) {
return;
}
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
dst_row[i00] = src0_row[i00];
}
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t> template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) { static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
const int i = blockDim.x*blockIdx.x + 2*threadIdx.x; const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
@ -5083,6 +5109,8 @@ static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, gg
const size_t s12 = nb12 / ggml_element_size(src1); const size_t s12 = nb12 / ggml_element_size(src1);
//const size_t s13 = nb13 / ggml_element_size(src1); //const size_t s13 = nb13 / ggml_element_size(src1);
GGML_ASSERT(ne00 % 2 == 0);
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>( k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
src0_dd, src1_dd, dst_dd, src0_dd, src1_dd, dst_dd,
ne00, /*ne01, ne02, ne03,*/ ne00, /*ne01, ne02, ne03,*/
@ -5094,6 +5122,38 @@ static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, gg
(void) dst; (void) dst;
} }
template<typename src0_t>
static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
GGML_TENSOR_BINARY_OP_LOCALS
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
// strides in elements
//const size_t s0 = nb0 / ggml_element_size(dst);
const size_t s1 = nb1 / ggml_element_size(dst);
const size_t s2 = nb2 / ggml_element_size(dst);
const size_t s3 = nb3 / ggml_element_size(dst);
const size_t s10 = nb10 / ggml_element_size(src1);
const size_t s11 = nb11 / ggml_element_size(src1);
const size_t s12 = nb12 / ggml_element_size(src1);
//const size_t s13 = nb13 / ggml_element_size(src1);
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
src0_dd, src1_dd, dst_dd,
ne00, /*ne01, ne02, ne03,*/
/*ne10, ne11,*/ ne12, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);
(void) dst;
}
template<float (*bin_op)(const float, const float)> template<float (*bin_op)(const float, const float)>
struct bin_bcast_cuda { struct bin_bcast_cuda {
template<typename src0_t, typename src1_t, typename dst_t> template<typename src0_t, typename src1_t, typename dst_t>
@ -6491,10 +6551,10 @@ static void ggml_cuda_op_get_rows(
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F16: case GGML_TYPE_F16:
get_rows_cuda<1, 1, convert_f16>(src0, src1, dst, src0_d, src1_i32, dst_d, stream); get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
break; break;
case GGML_TYPE_F32: case GGML_TYPE_F32:
get_rows_cuda<1, 1, convert_f32>(src0, src1, dst, src0_d, src1_i32, dst_d, stream); get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
break; break;
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream); get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);