mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-02-04 15:43:53 +01:00
ggml: align structures for 64bit, reorder params and ignore error-warn for Clang 19
This commit is contained in:
parent
a1649cc13f
commit
9a2380ec32
@ -339,10 +339,10 @@ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
|
||||
// graph allocator
|
||||
|
||||
// Per-tensor bookkeeping record kept by the graph allocator.
//
// Member order is deliberate: the pointer-sized member comes first and the
// narrower members follow, so on LP64 targets no padding is inserted between
// members (only tail padding after `allocated`).
struct hash_node {
    size_t offset;     // offset within the buffer
    int    n_children; // presumably the number of nodes still consuming this tensor -- TODO confirm against the allocator
    int    n_views;    // presumably the number of views referencing this tensor -- TODO confirm
    int    buffer_id;  // presumably the index of the buffer `offset` refers to -- TODO confirm
    bool   allocated;
};
|
||||
|
||||
|
@ -1287,8 +1287,8 @@ typedef pthread_mutex_t ggml_mutex_t;
|
||||
|
||||
// Threadpool def
|
||||
struct ggml_threadpool {
|
||||
ggml_mutex_t mutex; // mutex for cond.var
|
||||
ggml_cond_t cond; // cond.var for waiting for new work
|
||||
ggml_mutex_t mutex; // mutex for cond.var
|
||||
|
||||
struct ggml_cgraph * cgraph;
|
||||
struct ggml_cplan * cplan;
|
||||
@ -1299,19 +1299,19 @@ struct ggml_threadpool {
|
||||
atomic_int GGML_CACHE_ALIGN n_barrier_passed;
|
||||
atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
|
||||
|
||||
// these are atomic as an annotation for thread-sanitizer
|
||||
atomic_bool stop; // Used for stopping the threadpool altogether
|
||||
atomic_bool pause; // Used for pausing the threadpool or individual threads
|
||||
atomic_bool abort; // Used for aborting processing of a graph
|
||||
|
||||
struct ggml_compute_state * workers; // per thread state
|
||||
int n_threads_max; // number of threads in the pool
|
||||
atomic_int n_threads_cur; // number of threads used in the current graph
|
||||
int n_threads_max; // number of threads in the pool
|
||||
|
||||
int32_t prio; // Scheduling priority
|
||||
uint32_t poll; // Polling level (0 - no polling)
|
||||
|
||||
enum ggml_status ec;
|
||||
|
||||
// these are atomic as an annotation for thread-sanitizer
|
||||
atomic_bool stop; // Used for stopping the threadpool altogether
|
||||
atomic_bool pause; // Used for pausing the threadpool or individual threads
|
||||
atomic_bool abort; // Used for aborting processing of a graph
|
||||
};
|
||||
|
||||
// Per-thread state
|
||||
|
@ -285,6 +285,10 @@ enum ggml_cgraph_eval_order {
|
||||
};
|
||||
|
||||
struct ggml_cgraph {
|
||||
struct ggml_hash_set visited_hash_set;
|
||||
|
||||
enum ggml_cgraph_eval_order order;
|
||||
|
||||
int size; // maximum number of nodes/leafs/grads/grad_accs
|
||||
int n_nodes; // number of nodes currently in use
|
||||
int n_leafs; // number of leafs currently in use
|
||||
@ -293,10 +297,6 @@ struct ggml_cgraph {
|
||||
struct ggml_tensor ** grads; // the outputs of these tensors are the gradients of the nodes
|
||||
struct ggml_tensor ** grad_accs; // accumulators for node gradients
|
||||
struct ggml_tensor ** leafs; // tensors with constant data
|
||||
|
||||
struct ggml_hash_set visited_hash_set;
|
||||
|
||||
enum ggml_cgraph_eval_order order;
|
||||
};
|
||||
|
||||
// returns a slice of cgraph with nodes [i0, i1)
|
||||
|
@ -5932,6 +5932,8 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
|
||||
assert(obj_size == (size_t)((char *)p - (char *)cgraph));
|
||||
|
||||
*cgraph = (struct ggml_cgraph) {
|
||||
/*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
|
||||
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
|
||||
/*.size =*/ size,
|
||||
/*.n_nodes =*/ 0,
|
||||
/*.n_leafs =*/ 0,
|
||||
@ -5939,8 +5941,6 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
|
||||
/*.grads =*/ grads_ptr,
|
||||
/*.grad_accs =*/ grad_accs_ptr,
|
||||
/*.leafs =*/ leafs_ptr,
|
||||
/*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
|
||||
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
|
||||
};
|
||||
|
||||
ggml_hash_set_reset(&cgraph->visited_hash_set);
|
||||
@ -5958,6 +5958,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
|
||||
|
||||
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
|
||||
struct ggml_cgraph cgraph = {
|
||||
/*.visited_hash_set =*/ { 0, NULL, NULL },
|
||||
/*.order =*/ cgraph0->order,
|
||||
/*.size =*/ 0,
|
||||
/*.n_nodes =*/ i1 - i0,
|
||||
/*.n_leafs =*/ 0,
|
||||
@ -5965,8 +5967,6 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
|
||||
/*.grads =*/ NULL, // gradients would need visited_hash_set
|
||||
/*.grad_accs =*/ NULL,
|
||||
/*.leafs =*/ NULL,
|
||||
/*.visited_hash_set =*/ { 0, NULL, NULL },
|
||||
/*.order =*/ cgraph0->order,
|
||||
};
|
||||
|
||||
return cgraph;
|
||||
|
@ -285,6 +285,12 @@ extern "C" {
|
||||
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
|
||||
int32_t main_gpu;
|
||||
|
||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||
bool vocab_only; // only load the vocabulary, no weights
|
||||
bool use_mmap; // use mmap if possible
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool check_tensors; // validate model tensor data
|
||||
|
||||
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
|
||||
const float * tensor_split;
|
||||
|
||||
@ -298,12 +304,6 @@ extern "C" {
|
||||
|
||||
// override key-value pairs of the model meta data
|
||||
const struct llama_model_kv_override * kv_overrides;
|
||||
|
||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||
bool vocab_only; // only load the vocabulary, no weights
|
||||
bool use_mmap; // use mmap if possible
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool check_tensors; // validate model tensor data
|
||||
};
|
||||
|
||||
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
|
||||
|
@ -3716,14 +3716,14 @@ struct llama_model_params llama_model_default_params() {
|
||||
/*.n_gpu_layers =*/ 0,
|
||||
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
||||
/*.main_gpu =*/ 0,
|
||||
/*.tensor_split =*/ nullptr,
|
||||
/*.progress_callback =*/ nullptr,
|
||||
/*.progress_callback_user_data =*/ nullptr,
|
||||
/*.kv_overrides =*/ nullptr,
|
||||
/*.vocab_only =*/ false,
|
||||
/*.use_mmap =*/ true,
|
||||
/*.use_mlock =*/ false,
|
||||
/*.check_tensors =*/ false,
|
||||
/*.tensor_split =*/ nullptr,
|
||||
/*.progress_callback =*/ nullptr,
|
||||
/*.progress_callback_user_data =*/ nullptr,
|
||||
/*.kv_overrides =*/ nullptr,
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
|
@ -17,6 +17,12 @@
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
|
||||
// suppress Clang's -Wc++11-narrowing diagnostic ("non-constant-expression cannot be narrowed in initializer list")
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic push
|
||||
# pragma clang diagnostic ignored "-Wc++11-narrowing"
|
||||
#endif
|
||||
|
||||
//
|
||||
// helpers
|
||||
//
|
||||
@ -803,9 +809,9 @@ struct llm_tokenizer_ugm_session {
|
||||
}
|
||||
|
||||
// initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
|
||||
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
|
||||
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, vocab.token_unk(), -FLT_MAX});
|
||||
// at the beginning tokenization score is zero
|
||||
tokenization_results[0] = { vocab.token_unk(), 0, 0 };
|
||||
tokenization_results[0] = { 0, vocab.token_unk(), 0 };
|
||||
|
||||
for (size_t input_offset = 0; input_offset < input_len;) {
|
||||
size_t prefix_offset = input_offset;
|
||||
@ -835,7 +841,7 @@ struct llm_tokenizer_ugm_session {
|
||||
const double challenger_score = current_best.score_sum + token_score;
|
||||
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
|
||||
if (challenger_score > current_champ.score_sum) {
|
||||
struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
|
||||
struct best_tokenization challenger = { input_offset, token_id, (float) challenger_score };
|
||||
current_champ = challenger;
|
||||
}
|
||||
}
|
||||
@ -849,7 +855,7 @@ struct llm_tokenizer_ugm_session {
|
||||
prefix_offset = input_offset + n_utf8_code_units;
|
||||
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
|
||||
if (challenger_score > current_champ.score_sum) {
|
||||
struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
|
||||
struct best_tokenization challenger = { input_offset, vocab.token_unk(), (float) challenger_score };
|
||||
current_champ = challenger;
|
||||
}
|
||||
}
|
||||
@ -973,8 +979,8 @@ private:
|
||||
|
||||
// this structure stores the best tokenization so far at input_offset
|
||||
struct best_tokenization {
|
||||
llama_token token_id;
|
||||
size_t input_offset;
|
||||
llama_token token_id;
|
||||
float score_sum;
|
||||
};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user