cuda : use mmv kernel for quantum cache ops

2025-02-02 06:52:46 +01:00 · 2023-12-04 15:41:20 +02:00 · 2023-12-04 15:41:20 +02:00 · b881f630ca
commit b881f630ca
parent a1bf6c09f8
1 changed files with 4 additions and 2 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -6533,6 +6533,8 @@ inline void ggml_cuda_op_mul_mat_vec_q(
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, const cudaStream_t & stream) {

+    GGML_ASSERT(ggml_nrows(src1) == 1);
+
    const int64_t ne00 = src0->ne[0];
    const int64_t row_diff = row_high - row_low;

@ -7753,11 +7755,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
    } else if (src0->type == GGML_TYPE_F32) {
        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (ggml_nrows(src1) == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
 #ifdef GGML_CUDA_FORCE_DMMV
            const bool use_mul_mat_vec_q = false;
 #else
-            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
 #endif // GGML_CUDA_FORCE_DMMV

            if (use_mul_mat_vec_q) {