ggml: align structures for 64-bit, reorder params, and silence a narrowing warning-as-error for Clang 19

Herman Semenov 2025-01-18 19:33:02 +03:00
parent a1649cc13f
commit 9a2380ec32
7 changed files with 37 additions and 31 deletions
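The change applies a classic layout rule: on 64-bit targets the compiler pads each member up to its alignment, so ordering fields from widest to narrowest removes the padding holes. Below is a minimal sketch of the effect on hash_node, assuming a typical LP64 target (e.g. x86-64 Linux); the exact sizes are illustrative, not asserted by the commit:

#include <cstddef>
#include <cstdio>

// Old layout: the 8-byte size_t sits between 4-byte ints and a 1-byte bool,
// so the compiler inserts 4 bytes of padding before it and 7 after the bool.
struct hash_node_old {
    int         n_children;
    int         n_views;
    int         buffer_id;
    std::size_t offset;
    bool        allocated;
};

// New layout: widest member first; the ints pack directly behind it and
// only 3 bytes of tail padding remain.
struct hash_node_new {
    std::size_t offset;
    int         n_children;
    int         n_views;
    int         buffer_id;
    bool        allocated;
};

int main() {
    // Typical LP64 result: old = 32, new = 24.
    std::printf("old: %zu  new: %zu\n",
                sizeof(hash_node_old), sizeof(hash_node_new));
}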

View File

@@ -339,10 +339,10 @@ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
 // graph allocator
 struct hash_node {
+    size_t offset; // offset within the buffer
     int n_children;
     int n_views;
     int buffer_id;
-    size_t offset; // offset within the buffer
     bool allocated;
 };

View File

@@ -1287,8 +1287,8 @@ typedef pthread_mutex_t ggml_mutex_t;
 // Threadpool def
 struct ggml_threadpool {
-    ggml_mutex_t mutex; // mutex for cond.var
     ggml_cond_t cond; // cond.var for waiting for new work
+    ggml_mutex_t mutex; // mutex for cond.var
     struct ggml_cgraph * cgraph;
     struct ggml_cplan * cplan;
@@ -1299,19 +1299,19 @@ struct ggml_threadpool {
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
-    // these are atomic as an annotation for thread-sanitizer
-    atomic_bool stop; // Used for stopping the threadpool altogether
-    atomic_bool pause; // Used for pausing the threadpool or individual threads
-    atomic_bool abort; // Used for aborting processing of a graph
     struct ggml_compute_state * workers; // per thread state
-    int n_threads_max; // number of threads in the pool
     atomic_int n_threads_cur; // number of threads used in the current graph
+    int n_threads_max; // number of threads in the pool
     int32_t prio; // Scheduling priority
     uint32_t poll; // Polling level (0 - no polling)
     enum ggml_status ec;
+    // these are atomic as an annotation for thread-sanitizer
+    atomic_bool stop; // Used for stopping the threadpool altogether
+    atomic_bool pause; // Used for pausing the threadpool or individual threads
+    atomic_bool abort; // Used for aborting processing of a graph
 };
 // Per-thread state
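An aside on keeping such reorders from regressing: the commit adds no guard, but a compile-time check with offsetof/static_assert can pin the properties the reorder is meant to achieve. A sketch on a reduced, hypothetical stand-in for the threadpool tail (the real ggml_threadpool also contains mutexes and cache-aligned atomics), assuming LP64:

#include <cstddef>
#include <cstdint>

// Reduced stand-in mirroring the workers/n_threads/prio/poll/ec tail above,
// with the three 1-byte flags grouped at the end.
struct pool_tail {
    void *   workers;        // bytes 0..7
    int      n_threads_max;  // five 4-byte fields: bytes 8..27
    int      n_threads_cur;
    int32_t  prio;
    uint32_t poll;
    int      ec;
    bool     stop;           // flags pack into bytes 28..30
    bool     pause;
    bool     abort_flag;
};

// A future shuffle that reintroduces padding now fails to compile
// instead of silently growing the struct.
static_assert(offsetof(pool_tail, stop) == 28, "flags should follow the ints");
static_assert(sizeof(pool_tail) == 32, "unexpected padding in pool_tail");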

View File

@@ -285,6 +285,10 @@ enum ggml_cgraph_eval_order {
 };
 struct ggml_cgraph {
+    struct ggml_hash_set visited_hash_set;
+    enum ggml_cgraph_eval_order order;
     int size; // maximum number of nodes/leafs/grads/grad_accs
     int n_nodes; // number of nodes currently in use
     int n_leafs; // number of leafs currently in use
@@ -293,10 +297,6 @@ struct ggml_cgraph {
     struct ggml_tensor ** grads; // the outputs of these tensors are the gradients of the nodes
     struct ggml_tensor ** grad_accs; // accumulators for node gradients
     struct ggml_tensor ** leafs; // tensors with constant data
-    struct ggml_hash_set visited_hash_set;
-    enum ggml_cgraph_eval_order order;
 };
 // returns a slice of cgraph with nodes [i0, i1)
// returns a slice of cgraph with nodes [i0, i1)

View File

@@ -5932,6 +5932,8 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
     assert(obj_size == (size_t)((char *)p - (char *)cgraph));
     *cgraph = (struct ggml_cgraph) {
+        /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
+        /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
         /*.size =*/ size,
         /*.n_nodes =*/ 0,
         /*.n_leafs =*/ 0,
@@ -5939,8 +5941,6 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
         /*.grads =*/ grads_ptr,
         /*.grad_accs =*/ grad_accs_ptr,
         /*.leafs =*/ leafs_ptr,
-        /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
-        /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
     };
     ggml_hash_set_reset(&cgraph->visited_hash_set);
@@ -5958,6 +5958,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
     struct ggml_cgraph cgraph = {
+        /*.visited_hash_set =*/ { 0, NULL, NULL },
+        /*.order =*/ cgraph0->order,
         /*.size =*/ 0,
         /*.n_nodes =*/ i1 - i0,
         /*.n_leafs =*/ 0,
@@ -5965,8 +5967,6 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
         /*.grads =*/ NULL, // gradients would need visited_hash_set
         /*.grad_accs =*/ NULL,
         /*.leafs =*/ NULL,
-        /*.visited_hash_set =*/ { 0, NULL, NULL },
-        /*.order =*/ cgraph0->order,
     };
     return cgraph;
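Both initializer lists had to move in lock-step with the struct because the /*.field =*/ comments are documentation only: these are positional aggregate initializers, and values bind strictly in declaration order. A small illustration of the difference from true designated initializers:

struct point {
    int x;
    int y;
};

// Positional: binding follows declaration order, so reordering the struct
// silently reassigns values unless every such list is updated as well.
struct point a = { /*.x =*/ 1, /*.y =*/ 2 };  // the comments do not affect binding

// Designated (C99): names bind to fields, so this stays correct even if
// the struct is reordered; C++20 accepts this form too but requires the
// designators to follow declaration order.
struct point b = { .x = 1, .y = 2 };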

View File

@@ -285,6 +285,12 @@ extern "C" {
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
         int32_t main_gpu;
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool vocab_only; // only load the vocabulary, no weights
+        bool use_mmap; // use mmap if possible
+        bool use_mlock; // force system to keep model in RAM
+        bool check_tensors; // validate model tensor data
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
@@ -298,12 +304,6 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
-        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only; // only load the vocabulary, no weights
-        bool use_mmap; // use mmap if possible
-        bool use_mlock; // force system to keep model in RAM
-        bool check_tensors; // validate model tensor data
     };
 // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
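What moving the booleans buys, sketched on a reduced, hypothetical subset of the struct under LP64 assumptions: the four 1-byte bools now occupy what would otherwise be 4 bytes of padding between the 4-byte main_gpu and the first 8-byte pointer, instead of sitting in their own padded tail.

#include <cstddef>
#include <cstdint>

// Reduced stand-in for the reordered parameter block.
struct params_head {
    int32_t       main_gpu;      // bytes 0..3
    bool          vocab_only;    // bytes 4..7: the bools fill the hole
    bool          use_mmap;      // that padding would otherwise occupy
    bool          use_mlock;
    bool          check_tensors;
    const float * tensor_split;  // starts at byte 8, no padding needed
};

static_assert(offsetof(params_head, tensor_split) == 8,
              "bools should fill the hole after main_gpu");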

View File

@@ -3716,14 +3716,14 @@ struct llama_model_params llama_model_default_params() {
         /*.n_gpu_layers =*/ 0,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
-        /*.tensor_split =*/ nullptr,
-        /*.progress_callback =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
-        /*.kv_overrides =*/ nullptr,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
         /*.check_tensors =*/ false,
+        /*.tensor_split =*/ nullptr,
+        /*.progress_callback =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
+        /*.kv_overrides =*/ nullptr,
     };
 #ifdef GGML_USE_METAL

View File

@@ -17,6 +17,12 @@
 #include <set>
 #include <unordered_map>
+// disable Clang's C++11 narrowing error ("non-constant-expression cannot be narrowed") for this file
+#if defined(__clang__)
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wc++11-narrowing"
+#endif
 //
 // helpers
 //
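For context on the new pragma block, here is a minimal repro (hypothetical, not taken from the file) of the diagnostic family that -Wc++11-narrowing covers; Clang treats non-constant narrowing inside braced initializer lists as a hard error, which is presumably what Clang 19 started tripping over in this file. Compile with clang++ -std=c++11. Note that the hunk above pushes the diagnostic state without a matching pop, so the suppression stays in effect for the rest of the translation unit:

#include <cstddef>

struct best { std::size_t offset; int token; float score; };

#if defined(__clang__)
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wc++11-narrowing"
#endif

best make(long v) {
    // Without the pragma, Clang rejects this braced list:
    //   error: non-constant-expression cannot be narrowed from type
    //   'long' to 'float' in initializer list [-Wc++11-narrowing]
    return { static_cast<std::size_t>(v), 0, v };
}

#if defined(__clang__)
# pragma clang diagnostic pop
#endif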
@@ -803,9 +809,9 @@ struct llm_tokenizer_ugm_session {
     }
     // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-    std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
+    std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, vocab.token_unk(), -FLT_MAX});
     // at the beginning tokenization score is zero
-    tokenization_results[0] = { vocab.token_unk(), 0, 0 };
+    tokenization_results[0] = { 0, vocab.token_unk(), 0 };
     for (size_t input_offset = 0; input_offset < input_len;) {
         size_t prefix_offset = input_offset;
@@ -835,7 +841,7 @@ struct llm_tokenizer_ugm_session {
     const double challenger_score = current_best.score_sum + token_score;
     struct best_tokenization & current_champ = tokenization_results[prefix_offset];
     if (challenger_score > current_champ.score_sum) {
-        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+        struct best_tokenization challenger = { input_offset, token_id, (float) challenger_score };
         current_champ = challenger;
     }
 }
@@ -849,7 +855,7 @@ struct llm_tokenizer_ugm_session {
     prefix_offset = input_offset + n_utf8_code_units;
     struct best_tokenization & current_champ = tokenization_results[prefix_offset];
     if (challenger_score > current_champ.score_sum) {
-        struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
+        struct best_tokenization challenger = { input_offset, vocab.token_unk(), (float) challenger_score };
         current_champ = challenger;
     }
 }
@@ -973,8 +979,8 @@ private:
 // this structure stores the best tokenization so far at input_offset
 struct best_tokenization {
-    llama_token token_id;
     size_t input_offset;
+    llama_token token_id;
     float score_sum;
 };
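This last reorder is easy to quantify because tokenization_results above holds input_len + 1 of these records. A sketch assuming LP64 and that llama_token is the 32-bit integer typedef from llama.h:

#include <cstddef>
#include <cstdint>
#include <cstdio>

using llama_token = std::int32_t; // per llama.h

struct best_old {
    llama_token token_id;     // 4 bytes + 4 bytes padding before the size_t
    std::size_t input_offset; // 8 bytes
    float       score_sum;    // 4 bytes + 4 bytes tail padding -> 24 total
};

struct best_new {
    std::size_t input_offset; // 8 bytes, widest first
    llama_token token_id;     // 4 bytes
    float       score_sum;    // 4 bytes, no padding left -> 16 total
};

int main() {
    // LP64: 24 vs 16 bytes, so the per-position DP table shrinks by a third.
    std::printf("old: %zu  new: %zu\n", sizeof(best_old), sizeof(best_new));
}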