diff --git a/ggml-metal.m b/ggml-metal.m
index b438b83f9..3e3be98c5 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -925,9 +925,7 @@ void ggml_metal_graph_compute(
                                 nth1 = 1;
                                 if (ne11 * ne12 < 4) {
                                     [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
-                                //} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                                } else if (false) {
-                                    // TODO: with ggml_mul_mat_pad this kernel no longer seems to be needed
+                                } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
                                     [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
                                     nrows = ne11;
                                 } else {
diff --git a/llama.cpp b/llama.cpp
index 907d130f9..2437d0fcf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -438,50 +438,6 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
     ggml_graph_compute(graph, &plan);
 }
 
-//// EXPERIMENTAL:
-////
-//// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
-//// the idea is to represent the original matrix multiplication:
-////
-////   Z = X @ Y
-////
-//// with the sum of two matrix multiplications:
-////
-////   Z = (X_0 @ Y_0) + (X_1 @ Y_1)
-////
-//// here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad"
-//// and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
-//// general-purpose kernels
-////
-//static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) {
-////#if !defined(GGML_USE_METAL)
-////    return ggml_mul_mat(ctx, x, y);
-////#endif
-//
-//    // use padding only if dimension 0 is at least 8 times larger than the padding
-//    // else we won't get much benefit from the optimization
-//    const int n_pad_req = 8;
-//
-//    if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) {
-//        return ggml_mul_mat(ctx, x, y);
-//    }
-//
-//    struct ggml_tensor * x_0 = ggml_view_3d(ctx, x, (x->ne[0]/pad)*pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0);
-//    struct ggml_tensor * x_1 = ggml_view_3d(ctx, x,  x->ne[0]%pad,      x->ne[1], x->ne[2], x->nb[1], x->nb[2], x_0->ne[0]*x_0->nb[0]);
-//
-//    struct ggml_tensor * y_0 = ggml_view_3d(ctx, y, (y->ne[0]/pad)*pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0);
-//    struct ggml_tensor * y_1 = ggml_view_3d(ctx, y,  y->ne[0]%pad,      y->ne[1], y->ne[2], y->nb[1], y->nb[2], y_0->ne[0]*y_0->nb[0]);
-//
-//    return ggml_add(ctx,
-//            ggml_mul_mat(ctx, x_0, y_0),
-//            ggml_mul_mat(ctx, x_1, y_1));
-//}
-//
-//// TODO: check if other backends benefit from this and enable for all
-//#if defined(GGML_USE_METAL)
-////#define ggml_mul_mat ggml_mul_mat_pad
-//#endif
-
 //
 // llama helpers
 //