From e7e7b114556172d8dae0bbd9f13887a63949a8f5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 14 Sep 2023 22:52:01 +0300
Subject: [PATCH] llama : remove experimental stuff

---
 ggml-metal.m |  4 +---
 llama.cpp    | 44 --------------------------------------------
 2 files changed, 1 insertion(+), 47 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index b438b83f9..3e3be98c5 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -925,9 +925,7 @@ void ggml_metal_graph_compute(
                             nth1 = 1;
                             if (ne11 * ne12 < 4) {
                                 [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
-                            //} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                            } else if (false) {
-                                // TODO: with ggml_mul_mat_pad this kernel no longer seems to be needed
+                            } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
                                 [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
                                 nrows = ne11;
                             } else {
diff --git a/llama.cpp b/llama.cpp
index 907d130f9..2437d0fcf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -438,50 +438,6 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
     ggml_graph_compute(graph, &plan);
 }
 
-//// EXPERIMENTAL:
-////
-//// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
-//// the idea is to represent the original matrix multiplication:
-////
-////   Z = X @ Y
-////
-//// with the sum of two matrix multiplications:
-////
-////   Z = (X_0 @ Y_0) + (X_1 @ Y_1)
-////
-//// here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad"
-//// and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
-//// general-purpose kernels
-////
-//static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) {
-////#if !defined(GGML_USE_METAL)
-////    return ggml_mul_mat(ctx, x, y);
-////#endif
-
-//    // use padding only if dimension 0 is at least 8 times larger than the padding
-//    // else we won't get much benefit from the optimization
-//    const int n_pad_req = 8;
-//
-//    if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) {
-//        return ggml_mul_mat(ctx, x, y);
-//    }
-//
-//    struct ggml_tensor * x_0 = ggml_view_3d(ctx, x, (x->ne[0]/pad)*pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0);
-//    struct ggml_tensor * x_1 = ggml_view_3d(ctx, x,  x->ne[0]%pad,      x->ne[1], x->ne[2], x->nb[1], x->nb[2], x_0->ne[0]*x_0->nb[0]);
-//
-//    struct ggml_tensor * y_0 = ggml_view_3d(ctx, y, (y->ne[0]/pad)*pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0);
-//    struct ggml_tensor * y_1 = ggml_view_3d(ctx, y,  y->ne[0]%pad,      y->ne[1], y->ne[2], y->nb[1], y->nb[2], y_0->ne[0]*y_0->nb[0]);
-//
-//    return ggml_add(ctx,
-//            ggml_mul_mat(ctx, x_0, y_0),
-//            ggml_mul_mat(ctx, x_1, y_1));
-//}
-//
-//// TODO: check if other backends benefit from this and enable for all
-//#if defined(GGML_USE_METAL)
-////#define ggml_mul_mat ggml_mul_mat_pad
-//#endif
-
 //
 // llama helpers
 //
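
Note (not part of the patch): the removed ggml_mul_mat_pad helper relied on the identity that a matrix product can be split along the shared dimension and summed, Z = X @ Y = (X_0 @ Y_0) + (X_1 @ Y_1), where the "_0" views cover the part of dimension 0 that is divisible by the padding and the "_1" views cover the small remainder. The sketch below illustrates that identity in plain, self-contained C with row-major matrices; the sizes, names, and pad value of 8 are illustrative only (the removed helper defaulted to pad = 32 and operated on ggml tensor views).

/*
 * Standalone illustration of the split-and-sum matrix multiplication used by
 * the removed ggml_mul_mat_pad helper. All names and sizes are illustrative.
 */
#include <assert.h>
#include <math.h>
#include <stdio.h>

#define M   3   /* rows of Z                                  */
#define N   4   /* cols of Z                                  */
#define K   10  /* shared dimension, not divisible by PAD     */
#define PAD 8   /* padding; the removed helper defaulted to 32 */

/* Z[MxN] += X[:, k_off : k_off+ks] * Y[k_off : k_off+ks, :] (row-major) */
static void matmul_acc(double *Z, const double *X, const double *Y, int k_off, int ks) {
    for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j)
            for (int k = 0; k < ks; ++k)
                Z[i*N + j] += X[i*K + (k_off + k)] * Y[(k_off + k)*N + j];
}

int main(void) {
    double X[M*K], Y[K*N], Z_ref[M*N] = {0}, Z_split[M*N] = {0};

    /* fill X and Y with arbitrary values */
    for (int i = 0; i < M*K; ++i) X[i] = sin(i);
    for (int i = 0; i < K*N; ++i) Y[i] = cos(i);

    /* reference: a single multiplication over the full shared dimension K */
    matmul_acc(Z_ref, X, Y, 0, K);

    /* split: padded part K0 = (K/PAD)*PAD, plus remainder K - K0 */
    const int K0 = (K / PAD) * PAD;
    matmul_acc(Z_split, X, Y, 0,  K0);      /* Z += X_0 @ Y_0 */
    matmul_acc(Z_split, X, Y, K0, K - K0);  /* Z += X_1 @ Y_1 */

    for (int i = 0; i < M*N; ++i)
        assert(fabs(Z_ref[i] - Z_split[i]) < 1e-9);
    printf("padded part + remainder reproduces the full product\n");
    return 0;
}

Compiled with, for example, `cc demo.c -lm`, the check confirms that the padded product plus the remainder product equals the full product.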