From 7ea36953ba278484c0aa5f5e6df210ce6a24aad0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 9 Dec 2023 12:45:15 +0200
Subject: [PATCH] llama : first working version

---
 ggml.c    | 9 +++++++--
 ggml.h    | 2 +-
 llama.cpp | 8 ++++++--
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/ggml.c b/ggml.c
index 40c389661..322d0c850 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4105,7 +4105,9 @@ struct ggml_tensor * ggml_mul_mat_id(
     result->src[0] = ids;
     result->src[1] = b;
 
-    for (int64_t i = 0; i < n_as; i++) {
+    // TODO: n_as is currently the number of selected experts, but it should be the total number of experts (loop bound hard-coded to 8 until then)
+    //for (int64_t i = 0; i < n_as; i++) {
+    for (int64_t i = 0; i < 8; i++) {
         struct ggml_tensor * a = as[i];
         GGML_ASSERT(ggml_are_same_shape(as[0], a));
         GGML_ASSERT(ggml_can_mul_mat(a, b));
@@ -9758,7 +9760,10 @@ static void ggml_compute_forward_mul_mat_id(
 
     for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
         const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
-        GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]);
+
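+        // TODO: this assert seems wrong? row_id is used to index dst->src[row_id + 2], so ids->ne[0] is probably not the right upper bound - confirm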
+        //printf("row_id = %d, ids->ne[0] = %d, id = %d\n", row_id, ids->ne[0], id);
+        //GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]);
 
         const struct ggml_tensor * src0_row = dst->src[row_id + 2];
         ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
diff --git a/ggml.h b/ggml.h
index a8f10cbd5..e0cb3b99b 100644
--- a/ggml.h
+++ b/ggml.h
@@ -217,7 +217,7 @@
 #define GGML_MAX_DIMS           4
 #define GGML_MAX_PARAMS         1024
 #define GGML_MAX_CONTEXTS       64
-#define GGML_MAX_SRC            6
+#define GGML_MAX_SRC            10
 #define GGML_MAX_NAME           64
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
diff --git a/llama.cpp b/llama.cpp
index 3320c781f..6333af4aa 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4242,14 +4242,18 @@ struct llm_build_context {
                         LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);
 
-                const int n_experts_per_tok = 2; // TODO: param
+                // TODO: make these parameters instead of hard-coding them
+                const int n_experts = 8;
+                const int n_experts_per_tok = 2;
 
                 ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
                 ggml_tensor * probs = ggml_soft_max(ctx0, logits);                             // [n_tokens, num_experts]
 
                 // select experts
                 ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok]
-                ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts);        // [n_tokens, num_experts_per_tok, 1]
+                //ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts);        // [n_tokens, num_experts_per_tok, 1]
+                ggml_tensor * weights = ggml_get_rows(ctx0,
+                        ggml_reshape_3d(ctx0, probs, 1, n_experts, n_tokens), selected_experts);
                 weights = ggml_div(ctx0, weights, ggml_sum_rows(ctx0, weights));             // [n_tokens, num_experts_per_tok, 1]
 
                 // compute expert outputs