From 90c12e6b3cebfa7fec9ab2bb239cf509d0b828a8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 12 Dec 2023 20:05:58 +0200
Subject: [PATCH] ggml : do not use BLAS with ggml_mul_mat_id

---
 ggml.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index 6f5493096..4dbacbb49 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9508,8 +9508,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
 
+    // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
+    //       all the experts for each batch element and the processing would become incredibly slow
     // TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) &&
+    if (dst->op != GGML_OP_MUL_MAT_ID &&
+        ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
       //src0->type == GGML_TYPE_F32 &&
         src1->type == GGML_TYPE_F32 &&