From 9a2380ec32cc886eef1acf4a5bf7bef2d0a1d3c6 Mon Sep 17 00:00:00 2001
From: Herman Semenov
Date: Sat, 18 Jan 2025 19:33:02 +0300
Subject: [PATCH] ggml: align structures for 64bit, reorder params and ignore
 error-warn for Clang 19

---
 ggml/src/ggml-alloc.c        |  2 +-
 ggml/src/ggml-cpu/ggml-cpu.c | 14 +++++++-------
 ggml/src/ggml-impl.h         |  8 ++++----
 ggml/src/ggml.c              |  8 ++++----
 include/llama.h              | 12 ++++++------
 src/llama-model.cpp          |  8 ++++----
 src/llama-vocab.cpp          | 16 +++++++++++-----
 7 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 9a3bf9f29..9aff00376 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -339,10 +339,10 @@ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
 // graph allocator
 
 struct hash_node {
+    size_t offset; // offset within the buffer
     int n_children;
     int n_views;
     int buffer_id;
-    size_t offset; // offset within the buffer
     bool allocated;
 };
 
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 0ed92b3ff..079c053ba 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1287,8 +1287,8 @@ typedef pthread_mutex_t ggml_mutex_t;
 
 // Threadpool def
 struct ggml_threadpool {
-    ggml_mutex_t mutex;       // mutex for cond.var
     ggml_cond_t  cond;        // cond.var for waiting for new work
+    ggml_mutex_t mutex;       // mutex for cond.var
 
     struct ggml_cgraph * cgraph;
     struct ggml_cplan  * cplan;
@@ -1299,19 +1299,19 @@ struct ggml_threadpool {
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
 
-    // these are atomic as an annotation for thread-sanitizer
-    atomic_bool stop;         // Used for stopping the threadpool altogether
-    atomic_bool pause;        // Used for pausing the threadpool or individual threads
-    atomic_bool abort;        // Used for aborting processing of a graph
-
     struct ggml_compute_state * workers;   // per thread state
-    int          n_threads_max; // number of threads in the pool
     atomic_int   n_threads_cur; // number of threads used in the current graph
+    int          n_threads_max; // number of threads in the pool
 
     int32_t      prio;        // Scheduling priority
     uint32_t     poll;        // Polling level (0 - no polling)
 
     enum ggml_status ec;
+
+    // these are atomic as an annotation for thread-sanitizer
+    atomic_bool stop;         // Used for stopping the threadpool altogether
+    atomic_bool pause;        // Used for pausing the threadpool or individual threads
+    atomic_bool abort;        // Used for aborting processing of a graph
 };
 
 // Per-thread state
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index eab017889..678389763 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -285,6 +285,10 @@ enum ggml_cgraph_eval_order {
 };
 
 struct ggml_cgraph {
+    struct ggml_hash_set visited_hash_set;
+
+    enum ggml_cgraph_eval_order order;
+
     int size;    // maximum number of nodes/leafs/grads/grad_accs
     int n_nodes; // number of nodes currently in use
     int n_leafs; // number of leafs currently in use
@@ -293,10 +297,6 @@ struct ggml_cgraph {
     struct ggml_tensor ** grads;     // the outputs of these tensors are the gradients of the nodes
     struct ggml_tensor ** grad_accs; // accumulators for node gradients
     struct ggml_tensor ** leafs;     // tensors with constant data
-
-    struct ggml_hash_set visited_hash_set;
-
-    enum ggml_cgraph_eval_order order;
 };
 
 // returns a slice of cgraph with nodes [i0, i1)
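The struct reorderings above are pure layout changes: putting the 8-byte members first means the compiler inserts less padding on 64-bit targets. A minimal standalone sketch of the effect on hash_node (not part of the patch; exact sizes depend on the ABI, the numbers below assume a typical LP64 target):

    // layout_demo.cpp - compare padding of the two hash_node orderings
    #include <cstdio>

    struct hash_node_old {   // 8-byte member between the ints and the bool
        int    n_children;
        int    n_views;
        int    buffer_id;
        size_t offset;
        bool   allocated;
    };

    struct hash_node_new {   // 8-byte member first, as in the patch
        size_t offset;
        int    n_children;
        int    n_views;
        int    buffer_id;
        bool   allocated;
    };

    int main() {
        // on LP64 this prints 32 and 24: the old order pads 4 bytes before
        // offset and 7 bytes after allocated; the new order pads only 3
        std::printf("old: %zu bytes\n", sizeof(hash_node_old));
        std::printf("new: %zu bytes\n", sizeof(hash_node_new));
        return 0;
    }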
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index b1d0d4913..18349a298 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -5932,6 +5932,8 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
     assert(obj_size == (size_t)((char *)p - (char *)cgraph));
 
     *cgraph = (struct ggml_cgraph) {
+        /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
+        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
         /*.size         =*/ size,
         /*.n_nodes      =*/ 0,
         /*.n_leafs      =*/ 0,
@@ -5939,8 +5941,6 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
         /*.grads        =*/ grads_ptr,
         /*.grad_accs    =*/ grad_accs_ptr,
         /*.leafs        =*/ leafs_ptr,
-        /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
-        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
     };
 
     ggml_hash_set_reset(&cgraph->visited_hash_set);
@@ -5958,6 +5958,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
     struct ggml_cgraph cgraph = {
+        /*.visited_hash_set =*/ { 0, NULL, NULL },
+        /*.order            =*/ cgraph0->order,
         /*.size             =*/ 0,
         /*.n_nodes          =*/ i1 - i0,
         /*.n_leafs          =*/ 0,
@@ -5965,8 +5967,6 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
         /*.grads            =*/ NULL, // gradients would need visited_hash_set
         /*.grad_accs        =*/ NULL,
         /*.leafs            =*/ NULL,
-        /*.visited_hash_set =*/ { 0, NULL, NULL },
-        /*.order            =*/ cgraph0->order,
     };
 
     return cgraph;
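One thing to keep in mind when reviewing the two hunks above: the /*.field =*/ annotations in these compound literals are ordinary comments, so initialization is purely positional and must be reordered in lockstep with the struct declaration in ggml-impl.h. A reduced sketch of the hazard (hypothetical stand-in type, not the real ggml_cgraph):

    // positional_init.cpp - the comments do not bind values to fields
    struct graph_like {
        int    order; // moved to the front of the declaration
        int    size;
        void * nodes;
    };

    // correct: values follow the new declaration order
    graph_like ok = {
        /*.order =*/ 1,
        /*.size  =*/ 64,
        /*.nodes =*/ nullptr,
    };

    // stale: compiles silently, but 64 lands in 'order' and 1 in 'size'
    graph_like stale = {
        /*.size  =*/ 64,
        /*.order =*/ 1,
        /*.nodes =*/ nullptr,
    };

Designated initializers (.order = ..., .size = ...) would eliminate this hazard in the C sources, at the cost of diverging from the existing ggml initializer style.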
diff --git a/include/llama.h b/include/llama.h
index 298b8d1bc..8c6657a5e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -285,6 +285,12 @@ extern "C" {
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
         int32_t main_gpu;
 
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool vocab_only;    // only load the vocabulary, no weights
+        bool use_mmap;      // use mmap if possible
+        bool use_mlock;     // force system to keep model in RAM
+        bool check_tensors; // validate model tensor data
+
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
 
@@ -298,12 +304,6 @@ extern "C" {
 
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
-
-        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;    // only load the vocabulary, no weights
-        bool use_mmap;      // use mmap if possible
-        bool use_mlock;     // force system to keep model in RAM
-        bool check_tensors; // validate model tensor data
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index c2d23a8d3..cc307d3a9 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -3716,14 +3716,14 @@ struct llama_model_params llama_model_default_params() {
         /*.n_gpu_layers                =*/ 0,
         /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
-        /*.tensor_split                =*/ nullptr,
-        /*.progress_callback           =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
-        /*.kv_overrides                =*/ nullptr,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
         /*.check_tensors               =*/ false,
+        /*.tensor_split                =*/ nullptr,
+        /*.progress_callback           =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
     };
 
 #ifdef GGML_USE_METAL
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 9a680aed4..62b696bb7 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -17,6 +17,12 @@
 #include
 #include
 
+// disable C++11 narrowing warning: non-constant-expression cannot be narrowed
+#if defined(__clang__)
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wc++11-narrowing"
+#endif
+
 //
 // helpers
 //
@@ -803,9 +809,9 @@ struct llm_tokenizer_ugm_session {
         }
 
         // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, vocab.token_unk(), -FLT_MAX});
         // at the beginning tokenization score is zero
-        tokenization_results[0] = { vocab.token_unk(), 0, 0 };
+        tokenization_results[0] = { 0, vocab.token_unk(), 0 };
 
         for (size_t input_offset = 0; input_offset < input_len;) {
             size_t prefix_offset = input_offset;
@@ -835,7 +841,7 @@ struct llm_tokenizer_ugm_session {
                     const double challenger_score = current_best.score_sum + token_score;
                     struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                     if (challenger_score > current_champ.score_sum) {
-                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+                        struct best_tokenization challenger = { input_offset, token_id, (float) challenger_score };
                         current_champ = challenger;
                     }
                 }
@@ -849,7 +855,7 @@ struct llm_tokenizer_ugm_session {
                 prefix_offset = input_offset + n_utf8_code_units;
                 struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                 if (challenger_score > current_champ.score_sum) {
-                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
+                    struct best_tokenization challenger = { input_offset, vocab.token_unk(), (float) challenger_score };
                     current_champ = challenger;
                 }
             }
@@ -973,8 +979,8 @@ private:
 
     // this structure stores the best tokenization so far at input_offset
    struct best_tokenization {
-        llama_token token_id;
         size_t input_offset;
+        llama_token token_id;
         float score_sum;
    };
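On the new pragma block: Clang rejects braced initializers that narrow a non-constant expression (diagnostic group -Wc++11-narrowing, an error by default), and per the commit subject Clang 19 newly trips on this file; the pragma disables the diagnostic file-wide. A reduced sketch of the diagnostic class and of the usual push/ignore/pop pairing (illustrative types only; the hunk above shows only the push, so a matching pop presumably closes the region later in the file):

    // narrowing_demo.cpp - what -Wc++11-narrowing complains about
    #include <cstddef>

    struct best_tok {       // same field order as the patched struct
        std::size_t input_offset;
        int         token_id;
        float       score_sum;
    };

    #if defined(__clang__)
    # pragma clang diagnostic push
    # pragma clang diagnostic ignored "-Wc++11-narrowing"
    #endif

    best_tok make(std::size_t off, long long id, double score) {
        // with the pragma in effect this compiles; without it Clang rejects
        // the braced init: "non-constant-expression cannot be narrowed"
        // (long long -> int, double -> float)
        return { off, id, score };
    }

    #if defined(__clang__)
    # pragma clang diagnostic pop   // restore the previous diagnostic state
    #endif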