diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 3ad0d305d..16d17f801 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6533,6 +6533,8 @@ inline void ggml_cuda_op_mul_mat_vec_q(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
+    GGML_ASSERT(ggml_nrows(src1) == 1);
+
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
 
@@ -7753,11 +7755,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (ggml_nrows(src1) == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
 #ifdef GGML_CUDA_FORCE_DMMV
             const bool use_mul_mat_vec_q = false;
 #else
-            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
 #endif // GGML_CUDA_FORCE_DMMV
 
             if (use_mul_mat_vec_q) {
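
The change turns on the difference between ggml_nrows(src1), which in ggml is ne[1]*ne[2]*ne[3], and src1->ne[1] alone. The sketch below is a minimal stand-alone illustration, not part of the patch: toy_tensor and toy_nrows are hypothetical stand-ins for the real ggml types, and the gate expressions only mirror the conditions in the hunks above. It shows how a batched src1 (one column per batch entry, ne[2] > 1) now enters the vector branch via the ne[1] == 1 check, while the added ggml_nrows(src1) == 1 term keeps use_mul_mat_vec_q false for that shape, consistent with the new assert in ggml_cuda_op_mul_mat_vec_q.

// Hypothetical stand-ins for illustration only -- not the real ggml API.
#include <cstdint>
#include <cstdio>

struct toy_tensor {
    int64_t ne[4]; // ne[0] = elements per row, ne[1..3] = row/batch dimensions
};

// Mirrors ggml_nrows(): every dimension except ne[0] counts towards the row count.
static int64_t toy_nrows(const toy_tensor & t) {
    return t.ne[1] * t.ne[2] * t.ne[3];
}

int main() {
    // Batched src1: a single column per batch entry, but 8 batch entries.
    const toy_tensor src1 = { { 4096, 1, 8, 1 } };

    // Old branch condition: false here (nrows == 8), so the vector path was skipped.
    const bool old_gate = toy_nrows(src1) == 1;

    // New branch condition: true here, so the vector path is taken for this shape.
    const bool new_gate = src1.ne[1] == 1;

    // The added "&& ggml_nrows(src1) == 1" term (capability/type checks elided)
    // still rules out mul_mat_vec_q for the batched case, matching the new assert.
    const bool use_mul_mat_vec_q = toy_nrows(src1) == 1;

    std::printf("old_gate=%d new_gate=%d use_mul_mat_vec_q=%d\n",
                (int) old_gate, (int) new_gate, (int) use_mul_mat_vec_q);
    return 0;
}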