ggml: align structures for 64bit, reorder params and ignore error-warn for Clang 19

2025-02-04 15:43:53 +01:00 · 2025-01-18 19:33:02 +03:00 · 2025-01-18 19:33:02 +03:00 · 9a2380ec32
commit 9a2380ec32
parent a1649cc13f
7 changed files with 37 additions and 31 deletions
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@ -339,10 +339,10 @@ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
 // graph allocator

 struct hash_node {
+    size_t offset; // offset within the buffer
    int n_children;
    int n_views;
    int buffer_id;
-    size_t offset; // offset within the buffer
    bool allocated;
 };

--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@ -1287,8 +1287,8 @@ typedef pthread_mutex_t    ggml_mutex_t;

 // Threadpool def
 struct ggml_threadpool {
-    ggml_mutex_t mutex;       // mutex for cond.var
    ggml_cond_t  cond;        // cond.var for waiting for new work
+    ggml_mutex_t mutex;       // mutex for cond.var

    struct ggml_cgraph * cgraph;
    struct ggml_cplan  * cplan;
@ -1299,19 +1299,19 @@ struct ggml_threadpool {
    atomic_int GGML_CACHE_ALIGN n_barrier_passed;
    atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.

-    // these are atomic as an annotation for thread-sanitizer
-    atomic_bool stop;         // Used for stopping the threadpool altogether
-    atomic_bool pause;        // Used for pausing the threadpool or individual threads
-    atomic_bool abort;        // Used for aborting processing of a graph
-
    struct ggml_compute_state * workers;   // per thread state
-    int          n_threads_max; // number of threads in the pool
    atomic_int   n_threads_cur; // number of threads used in the current graph
+    int          n_threads_max; // number of threads in the pool

    int32_t      prio;        // Scheduling priority
    uint32_t     poll;        // Polling level (0 - no polling)

    enum ggml_status ec;
+
+    // these are atomic as an annotation for thread-sanitizer
+    atomic_bool stop;         // Used for stopping the threadpool altogether
+    atomic_bool pause;        // Used for pausing the threadpool or individual threads
+    atomic_bool abort;        // Used for aborting processing of a graph
 };

 // Per-thread state
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@ -285,6 +285,10 @@ enum ggml_cgraph_eval_order {
 };

 struct ggml_cgraph {
+    struct ggml_hash_set visited_hash_set;
+
+    enum ggml_cgraph_eval_order order;
+
    int size;    // maximum number of nodes/leafs/grads/grad_accs
    int n_nodes; // number of nodes currently in use
    int n_leafs; // number of leafs currently in use
@ -293,10 +297,6 @@ struct ggml_cgraph {
    struct ggml_tensor ** grads;     // the outputs of these tensors are the gradients of the nodes
    struct ggml_tensor ** grad_accs; // accumulators for node gradients
    struct ggml_tensor ** leafs;     // tensors with constant data
-
-    struct ggml_hash_set visited_hash_set;
-
-    enum ggml_cgraph_eval_order order;
 };

 // returns a slice of cgraph with nodes [i0, i1)
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@ -5932,6 +5932,8 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
    assert(obj_size == (size_t)((char *)p - (char *)cgraph));

    *cgraph = (struct ggml_cgraph) {
+        /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
+        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
        /*.size         =*/ size,
        /*.n_nodes      =*/ 0,
        /*.n_leafs      =*/ 0,
@ -5939,8 +5941,6 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
        /*.grads        =*/ grads_ptr,
        /*.grad_accs    =*/ grad_accs_ptr,
        /*.leafs        =*/ leafs_ptr,
-        /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
-        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
    };

    ggml_hash_set_reset(&cgraph->visited_hash_set);
@ -5958,6 +5958,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {

 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
    struct ggml_cgraph cgraph = {
+        /*.visited_hash_set =*/ { 0, NULL, NULL },
+        /*.order            =*/ cgraph0->order,
        /*.size             =*/ 0,
        /*.n_nodes          =*/ i1 - i0,
        /*.n_leafs          =*/ 0,
@ -5965,8 +5967,6 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
        /*.grads            =*/ NULL, // gradients would need visited_hash_set
        /*.grad_accs        =*/ NULL,
        /*.leafs            =*/ NULL,
-        /*.visited_hash_set =*/ { 0, NULL, NULL },
-        /*.order            =*/ cgraph0->order,
    };

    return cgraph;
--- a/include/llama.h
+++ b/include/llama.h
@ -285,6 +285,12 @@ extern "C" {
        // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
        int32_t main_gpu;

+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool vocab_only;    // only load the vocabulary, no weights
+        bool use_mmap;      // use mmap if possible
+        bool use_mlock;     // force system to keep model in RAM
+        bool check_tensors; // validate model tensor data
+
        // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
        const float * tensor_split;

@ -298,12 +304,6 @@ extern "C" {

        // override key-value pairs of the model meta data
        const struct llama_model_kv_override * kv_overrides;
-
-        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;    // only load the vocabulary, no weights
-        bool use_mmap;      // use mmap if possible
-        bool use_mlock;     // force system to keep model in RAM
-        bool check_tensors; // validate model tensor data
    };

    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@ -3716,14 +3716,14 @@ struct llama_model_params llama_model_default_params() {
        /*.n_gpu_layers                =*/ 0,
        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu                    =*/ 0,
-        /*.tensor_split                =*/ nullptr,
-        /*.progress_callback           =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
-        /*.kv_overrides                =*/ nullptr,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_mlock                   =*/ false,
        /*.check_tensors               =*/ false,
+        /*.tensor_split                =*/ nullptr,
+        /*.progress_callback           =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
    };

 #ifdef GGML_USE_METAL
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@ -17,6 +17,12 @@
 #include <set>
 #include <unordered_map>

+// disable C++11 deprecation warning non-constant-expression cannot be narrowed
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wc++11-narrowing"
+#endif
+
 //
 // helpers
 //
@ -803,9 +809,9 @@ struct llm_tokenizer_ugm_session {
        }

        // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, vocab.token_unk(), -FLT_MAX});
        // at the beginning tokenization score is zero
-        tokenization_results[0] = { vocab.token_unk(), 0, 0 };
+        tokenization_results[0] = { 0, vocab.token_unk(), 0 };

        for (size_t input_offset = 0; input_offset < input_len;) {
            size_t prefix_offset = input_offset;
@ -835,7 +841,7 @@ struct llm_tokenizer_ugm_session {
                    const double challenger_score = current_best.score_sum + token_score;
                    struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                    if (challenger_score > current_champ.score_sum) {
-                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+                        struct best_tokenization challenger = { input_offset, token_id, (float) challenger_score };
                        current_champ = challenger;
                    }
                }
@ -849,7 +855,7 @@ struct llm_tokenizer_ugm_session {
                prefix_offset = input_offset + n_utf8_code_units;
                struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                if (challenger_score > current_champ.score_sum) {
-                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
+                    struct best_tokenization challenger = { input_offset, vocab.token_unk(), (float) challenger_score };
                    current_champ = challenger;
                }
            }
@ -973,8 +979,8 @@ private:

    // this structure stores the best tokenization so far at input_offset
    struct best_tokenization {
-        llama_token token_id;
        size_t input_offset;
+        llama_token token_id;
        float score_sum;
    };