cuda : use mmv kernel for quantum cache ops

This commit is contained in:
Georgi Gerganov 2023-12-04 15:41:20 +02:00
parent a1bf6c09f8
commit b881f630ca
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

View File

@ -6533,6 +6533,8 @@ inline void ggml_cuda_op_mul_mat_vec_q(
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, const cudaStream_t & stream) { const int64_t src1_padded_row_size, const cudaStream_t & stream) {
GGML_ASSERT(ggml_nrows(src1) == 1);
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t row_diff = row_high - row_low; const int64_t row_diff = row_high - row_low;
@ -7753,11 +7755,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
} else if (src0->type == GGML_TYPE_F32) { } else if (src0->type == GGML_TYPE_F32) {
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
if (ggml_nrows(src1) == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
#ifdef GGML_CUDA_FORCE_DMMV #ifdef GGML_CUDA_FORCE_DMMV
const bool use_mul_mat_vec_q = false; const bool use_mul_mat_vec_q = false;
#else #else
const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
#endif // GGML_CUDA_FORCE_DMMV #endif // GGML_CUDA_FORCE_DMMV
if (use_mul_mat_vec_q) { if (use_mul_mat_vec_q) {