From 9a2380ec32cc886eef1acf4a5bf7bef2d0a1d3c6 Mon Sep 17 00:00:00 2001
From: Herman Semenov
Date: Sat, 18 Jan 2025 19:33:02 +0300
Subject: [PATCH] ggml: align structures for 64bit, reorder params and ignore
 error-warn for Clang 19

---
 ggml/src/ggml-alloc.c        |  2 +-
 ggml/src/ggml-cpu/ggml-cpu.c | 14 +++++++-------
 ggml/src/ggml-impl.h         |  8 ++++----
 ggml/src/ggml.c              |  8 ++++----
 include/llama.h              | 12 ++++++------
 src/llama-model.cpp          |  8 ++++----
 src/llama-vocab.cpp          | 16 +++++++++++-----
 7 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 9a3bf9f29..9aff00376 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -339,10 +339,10 @@ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
 // graph allocator
 
 struct hash_node {
+    size_t offset; // offset within the buffer
     int n_children;
     int n_views;
     int buffer_id;
-    size_t offset; // offset within the buffer
     bool allocated;
 };
 
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 0ed92b3ff..079c053ba 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1287,8 +1287,8 @@ typedef pthread_mutex_t ggml_mutex_t;
 
 // Threadpool def
 struct ggml_threadpool {
-    ggml_mutex_t mutex;       // mutex for cond.var
     ggml_cond_t  cond;        // cond.var for waiting for new work
+    ggml_mutex_t mutex;       // mutex for cond.var
 
     struct ggml_cgraph * cgraph;
     struct ggml_cplan  * cplan;
@@ -1299,19 +1299,19 @@ struct ggml_threadpool {
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
 
-    // these are atomic as an annotation for thread-sanitizer
-    atomic_bool stop;         // Used for stopping the threadpool altogether
-    atomic_bool pause;        // Used for pausing the threadpool or individual threads
-    atomic_bool abort;        // Used for aborting processing of a graph
-
     struct ggml_compute_state * workers;   // per thread state
-    int          n_threads_max; // number of threads in the pool
     atomic_int   n_threads_cur; // number of threads used in the current graph
+    int          n_threads_max; // number of threads in the pool
 
     int32_t      prio;        // Scheduling priority
     uint32_t     poll;        // Polling level (0 - no polling)
 
     enum ggml_status ec;
+
+    // these are atomic as an annotation for thread-sanitizer
+    atomic_bool stop;         // Used for stopping the threadpool altogether
+    atomic_bool pause;        // Used for pausing the threadpool or individual threads
+    atomic_bool abort;        // Used for aborting processing of a graph
 };
 
 // Per-thread state
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index eab017889..678389763 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -285,6 +285,10 @@ enum ggml_cgraph_eval_order {
 };
 
 struct ggml_cgraph {
+    struct ggml_hash_set visited_hash_set;
+
+    enum ggml_cgraph_eval_order order;
+
     int size;    // maximum number of nodes/leafs/grads/grad_accs
     int n_nodes; // number of nodes currently in use
     int n_leafs; // number of leafs currently in use
@@ -293,10 +297,6 @@ struct ggml_cgraph {
     struct ggml_tensor ** grads;     // the outputs of these tensors are the gradients of the nodes
     struct ggml_tensor ** grad_accs; // accumulators for node gradients
     struct ggml_tensor ** leafs;     // tensors with constant data
-
-    struct ggml_hash_set visited_hash_set;
-
-    enum ggml_cgraph_eval_order order;
 };
 
 // returns a slice of cgraph with nodes [i0, i1)
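The struct reorderings above are pure layout changes: putting the 8-byte members first means the compiler inserts less padding on 64-bit targets. A minimal standalone sketch of the effect on hash_node (not part of the patch; exact sizes depend on the ABI, the numbers below assume a typical LP64 target):

    // layout_demo.cpp - compare padding of the two hash_node orderings
    #include <cstdio>

    struct hash_node_old {   // 8-byte member between the ints and the bool
        int    n_children;
        int    n_views;
        int    buffer_id;
        size_t offset;
        bool   allocated;
    };

    struct hash_node_new {   // 8-byte member first, as in the patch
        size_t offset;
        int    n_children;
        int    n_views;
        int    buffer_id;
        bool   allocated;
    };

    int main() {
        // on LP64 this prints 32 and 24: the old order pads 4 bytes before
        // offset and 7 bytes after allocated; the new order pads only 3
        std::printf("old: %zu bytes\n", sizeof(hash_node_old));
        std::printf("new: %zu bytes\n", sizeof(hash_node_new));
        return 0;
    }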
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index b1d0d4913..18349a298 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -5932,6 +5932,8 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
     assert(obj_size == (size_t)((char *)p - (char *)cgraph));
 
     *cgraph = (struct ggml_cgraph) {
+        /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
+        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
         /*.size         =*/ size,
         /*.n_nodes      =*/ 0,
         /*.n_leafs      =*/ 0,
@@ -5939,8 +5941,6 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
         /*.grads        =*/ grads_ptr,
         /*.grad_accs    =*/ grad_accs_ptr,
         /*.leafs        =*/ leafs_ptr,
-        /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
-        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
     };
 
     ggml_hash_set_reset(&cgraph->visited_hash_set);
@@ -5958,6 +5958,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
     struct ggml_cgraph cgraph = {
+        /*.visited_hash_set =*/ { 0, NULL, NULL },
+        /*.order            =*/ cgraph0->order,
         /*.size             =*/ 0,
         /*.n_nodes          =*/ i1 - i0,
         /*.n_leafs          =*/ 0,
@@ -5965,8 +5967,6 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
         /*.grads            =*/ NULL, // gradients would need visited_hash_set
         /*.grad_accs        =*/ NULL,
         /*.leafs            =*/ NULL,
-        /*.visited_hash_set =*/ { 0, NULL, NULL },
-        /*.order            =*/ cgraph0->order,
     };
 
     return cgraph;
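One thing to keep in mind when reviewing the two hunks above: the /*.field =*/ annotations in these compound literals are ordinary comments, so initialization is purely positional and must be reordered in lockstep with the struct declaration in ggml-impl.h. A reduced sketch of the hazard (hypothetical stand-in type, not the real ggml_cgraph):

    // positional_init.cpp - the comments do not bind values to fields
    struct graph_like {
        int    order; // moved to the front of the declaration
        int    size;
        void * nodes;
    };

    // correct: values follow the new declaration order
    graph_like ok = {
        /*.order =*/ 1,
        /*.size  =*/ 64,
        /*.nodes =*/ nullptr,
    };

    // stale: compiles silently, but 64 lands in 'order' and 1 in 'size'
    graph_like stale = {
        /*.size  =*/ 64,
        /*.order =*/ 1,
        /*.nodes =*/ nullptr,
    };

Designated initializers (.order = ..., .size = ...) would eliminate this hazard in the C sources, at the cost of diverging from the existing ggml initializer style.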
diff --git a/include/llama.h b/include/llama.h
index 298b8d1bc..8c6657a5e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -285,6 +285,12 @@ extern "C" {
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
         int32_t main_gpu;
 
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool vocab_only;    // only load the vocabulary, no weights
+        bool use_mmap;      // use mmap if possible
+        bool use_mlock;     // force system to keep model in RAM
+        bool check_tensors; // validate model tensor data
+
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
 
@@ -298,12 +304,6 @@ extern "C" {
 
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
-
-        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;    // only load the vocabulary, no weights
-        bool use_mmap;      // use mmap if possible
-        bool use_mlock;     // force system to keep model in RAM
-        bool check_tensors; // validate model tensor data
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index c2d23a8d3..cc307d3a9 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -3716,14 +3716,14 @@ struct llama_model_params llama_model_default_params() {
         /*.n_gpu_layers                =*/ 0,
         /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
-        /*.tensor_split                =*/ nullptr,
-        /*.progress_callback           =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
-        /*.kv_overrides                =*/ nullptr,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
         /*.check_tensors               =*/ false,
+        /*.tensor_split                =*/ nullptr,
+        /*.progress_callback           =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
     };
 
 #ifdef GGML_USE_METAL
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 9a680aed4..62b696bb7 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -17,6 +17,12 @@
 #include
 #include
 
+// disable C++11 narrowing warning: non-constant-expression cannot be narrowed
+#if defined(__clang__)
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wc++11-narrowing"
+#endif
+
 //
 // helpers
 //
@@ -803,9 +809,9 @@ struct llm_tokenizer_ugm_session {
         }
 
         // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, vocab.token_unk(), -FLT_MAX});
         // at the beginning tokenization score is zero
-        tokenization_results[0] = { vocab.token_unk(), 0, 0 };
+        tokenization_results[0] = { 0, vocab.token_unk(), 0 };
 
         for (size_t input_offset = 0; input_offset < input_len;) {
             size_t prefix_offset = input_offset;
@@ -835,7 +841,7 @@ struct llm_tokenizer_ugm_session {
                     const double challenger_score = current_best.score_sum + token_score;
                     struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                     if (challenger_score > current_champ.score_sum) {
-                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+                        struct best_tokenization challenger = { input_offset, token_id, (float) challenger_score };
                         current_champ = challenger;
                     }
                 }
@@ -849,7 +855,7 @@ struct llm_tokenizer_ugm_session {
                 prefix_offset = input_offset + n_utf8_code_units;
                 struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                 if (challenger_score > current_champ.score_sum) {
-                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
+                    struct best_tokenization challenger = { input_offset, vocab.token_unk(), (float) challenger_score };
                     current_champ = challenger;
                 }
             }
@@ -973,8 +979,8 @@ private:
 
     // this structure stores the best tokenization so far at input_offset
    struct best_tokenization {
-        llama_token token_id;
         size_t input_offset;
+        llama_token token_id;
         float score_sum;
    };
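On the new pragma block: Clang rejects braced initializers that narrow a non-constant expression (diagnostic group -Wc++11-narrowing, an error by default), and per the commit subject Clang 19 newly trips on this file; the pragma disables the diagnostic file-wide. A reduced sketch of the diagnostic class and of the usual push/ignore/pop pairing (illustrative types only; the hunk above shows only the push, so a matching pop presumably closes the region later in the file):

    // narrowing_demo.cpp - what -Wc++11-narrowing complains about
    #include <cstddef>

    struct best_tok {       // same field order as the patched struct
        std::size_t input_offset;
        int         token_id;
        float       score_sum;
    };

    #if defined(__clang__)
    # pragma clang diagnostic push
    # pragma clang diagnostic ignored "-Wc++11-narrowing"
    #endif

    best_tok make(std::size_t off, long long id, double score) {
        // with the pragma in effect this compiles; without it Clang rejects
        // the braced init: "non-constant-expression cannot be narrowed"
        // (long long -> int, double -> float)
        return { off, id, score };
    }

    #if defined(__clang__)
    # pragma clang diagnostic pop   // restore the previous diagnostic state
    #endif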