mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-02-10 02:03:07 +01:00)

commit 9a2380ec32 (parent a1649cc13f)
ggml: align structures for 64-bit, reorder params, and suppress a narrowing error/warning for Clang 19
@@ -339,10 +339,10 @@ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
 // graph allocator

 struct hash_node {
+    size_t offset; // offset within the buffer
     int n_children;
     int n_views;
     int buffer_id;
-    size_t offset; // offset within the buffer
     bool allocated;
 };

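Why this reorder helps: on an LP64 target, `size_t` is 8 bytes and must sit at an 8-byte offset, so placing it after three 4-byte `int`s forces interior padding and rounds the struct up. A minimal sketch of the effect (not part of the commit; the two structs are stand-ins for `hash_node` before and after, and the offsets assume typical LP64 layout rules):

    // Sketch: padding before vs. after the reorder, assuming LP64.
    #include <cstdio>

    struct hash_node_before {      // member offsets (LP64):
        int    n_children;         //  0
        int    n_views;            //  4
        int    buffer_id;          //  8
        size_t offset;             // 16, after 4 bytes of padding at 12
        bool   allocated;          // 24
    };                             // sizeof == 32 after tail padding

    struct hash_node_after {
        size_t offset;             //  0, widest member first, no interior padding
        int    n_children;         //  8
        int    n_views;            // 12
        int    buffer_id;          // 16
        bool   allocated;          // 20
    };                             // sizeof == 24 after tail padding

    int main() {
        std::printf("before: %zu bytes, after: %zu bytes\n",
                    sizeof(hash_node_before), sizeof(hash_node_after));
        return 0;
    }

The same motivation drives the `ggml_threadpool`, `ggml_cgraph`, and `llama_model_params` hunks below.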
@@ -1287,8 +1287,8 @@ typedef pthread_mutex_t ggml_mutex_t;

 // Threadpool def
 struct ggml_threadpool {
-    ggml_mutex_t mutex;       // mutex for cond.var
     ggml_cond_t  cond;        // cond.var for waiting for new work
+    ggml_mutex_t mutex;       // mutex for cond.var

     struct ggml_cgraph * cgraph;
     struct ggml_cplan  * cplan;
@@ -1299,19 +1299,19 @@ struct ggml_threadpool {
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.

-    // these are atomic as an annotation for thread-sanitizer
-    atomic_bool stop;         // Used for stopping the threadpool altogether
-    atomic_bool pause;        // Used for pausing the threadpool or individual threads
-    atomic_bool abort;        // Used for aborting processing of a graph

     struct ggml_compute_state * workers;   // per thread state
-    int          n_threads_max; // number of threads in the pool
     atomic_int   n_threads_cur; // number of threads used in the current graph
+    int          n_threads_max; // number of threads in the pool

     int32_t      prio;        // Scheduling priority
     uint32_t     poll;        // Polling level (0 - no polling)

     enum ggml_status ec;
+
+    // these are atomic as an annotation for thread-sanitizer
+    atomic_bool stop;         // Used for stopping the threadpool altogether
+    atomic_bool pause;        // Used for pausing the threadpool or individual threads
+    atomic_bool abort;        // Used for aborting processing of a graph
 };

 // Per-thread state
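Grouping the three one-byte `atomic_bool` flags at the end packs them together instead of scattering padding through the hot part of the struct. The context lines also show `GGML_CACHE_ALIGN` on the barrier counters; for orientation, a rough equivalent of that annotation (a sketch, not ggml's actual definition; ggml derives the real line size per target, so 64 bytes is only an assumption here):

    // Sketch: what a GGML_CACHE_ALIGN-style annotation amounts to. Giving each
    // heavily contended atomic its own cache line prevents false sharing
    // between threads spinning on different counters.
    #include <atomic>
    #include <cstddef>

    constexpr std::size_t CACHE_LINE = 64; // assumed; the real value is per-platform

    struct barrier_counters {
        alignas(CACHE_LINE) std::atomic<int> n_barrier;        // own cache line
        alignas(CACHE_LINE) std::atomic<int> n_barrier_passed; // own cache line
    };

    static_assert(alignof(barrier_counters) == CACHE_LINE,
                  "the struct inherits the cache-line alignment of its members");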
@@ -285,6 +285,10 @@ enum ggml_cgraph_eval_order {
 };

 struct ggml_cgraph {
+    struct ggml_hash_set visited_hash_set;
+
+    enum ggml_cgraph_eval_order order;
+
     int size;    // maximum number of nodes/leafs/grads/grad_accs
     int n_nodes; // number of nodes currently in use
     int n_leafs; // number of leafs currently in use
@@ -293,10 +297,6 @@ struct ggml_cgraph {
     struct ggml_tensor ** grads;     // the outputs of these tensors are the gradients of the nodes
     struct ggml_tensor ** grad_accs; // accumulators for node gradients
     struct ggml_tensor ** leafs;     // tensors with constant data
-
-    struct ggml_hash_set visited_hash_set;
-
-    enum ggml_cgraph_eval_order order;
 };

 // returns a slice of cgraph with nodes [i0, i1)
@@ -5932,6 +5932,8 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
     assert(obj_size == (size_t)((char *)p - (char *)cgraph));

     *cgraph = (struct ggml_cgraph) {
+        /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
+        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
         /*.size         =*/ size,
         /*.n_nodes      =*/ 0,
         /*.n_leafs      =*/ 0,
@@ -5939,8 +5941,6 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
         /*.grads        =*/ grads_ptr,
         /*.grad_accs    =*/ grad_accs_ptr,
         /*.leafs        =*/ leafs_ptr,
-        /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
-        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
     };

     ggml_hash_set_reset(&cgraph->visited_hash_set);
@@ -5958,6 +5958,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {

 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
     struct ggml_cgraph cgraph = {
+        /*.visited_hash_set =*/ { 0, NULL, NULL },
+        /*.order            =*/ cgraph0->order,
         /*.size             =*/ 0,
         /*.n_nodes          =*/ i1 - i0,
         /*.n_leafs          =*/ 0,
@@ -5965,8 +5967,6 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
         /*.grads            =*/ NULL, // gradients would need visited_hash_set
         /*.grad_accs        =*/ NULL,
         /*.leafs            =*/ NULL,
-        /*.visited_hash_set =*/ { 0, NULL, NULL },
-        /*.order            =*/ cgraph0->order,
     };

     return cgraph;
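Both initializer hunks have to mirror the struct reorder by hand, because the `/*.field =*/` comments are only decoration on positional initializers; nothing checks them against the declaration. A hedged alternative (not what the commit does; `toy_cgraph` is an invented stand-in): C99 designated initializers in the C sources, or C++20 designators elsewhere, turn a stale order into a compile error rather than a silently transposed value:

    // Sketch: designated initializers track a field reorder automatically.
    // C++20 requires designators in declaration order, so reordering the
    // struct without updating this initializer fails to compile instead of
    // miscompiling.
    struct toy_cgraph {            // invented stand-in for ggml_cgraph
        int hash_size;             // stands in for visited_hash_set
        int order;
        int size;
        int n_nodes;
    };

    toy_cgraph make_view(int n_nodes, int order) {
        return {
            .hash_size = 0,
            .order     = order,
            .size      = 0,
            .n_nodes   = n_nodes,
        };
    }

The `llama_model_default_params` hunk below faces the same hazard.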
@@ -285,6 +285,12 @@ extern "C" {
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
         int32_t main_gpu;

+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool vocab_only;    // only load the vocabulary, no weights
+        bool use_mmap;      // use mmap if possible
+        bool use_mlock;     // force system to keep model in RAM
+        bool check_tensors; // validate model tensor data
+
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;

@@ -298,12 +304,6 @@ extern "C" {

         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
-
-        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;    // only load the vocabulary, no weights
-        bool use_mmap;      // use mmap if possible
-        bool use_mlock;     // force system to keep model in RAM
-        bool check_tensors; // validate model tensor data
     };

     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
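The moved comment states the rationale: `llama_model_params` is passed by value across the C API, so a compact, hole-free layout keeps the copy cheap and predictable. Concretely, the four bools now fill the 4-byte hole after `int32_t main_gpu` instead of dangling past the last pointer. A sketch of the effect (not the real struct; most members are omitted and LP64 is assumed):

    // Sketch: the bools fill the padding hole after the int32_t.
    #include <cstdint>
    #include <cstdio>

    struct params_before {
        int32_t       main_gpu;      // 0..3, then a 4-byte hole before the pointer
        const float * tensor_split;  // 8..15 (remaining pointers omitted)
        bool vocab_only, use_mmap, use_mlock, check_tensors; // 16..19
    };                               // sizeof == 24 after tail padding

    struct params_after {
        int32_t       main_gpu;      // 0..3
        bool vocab_only, use_mmap, use_mlock, check_tensors; // 4..7, hole filled
        const float * tensor_split;  // 8..15
    };                               // sizeof == 16, no padding at all

    int main() {
        std::printf("before: %zu, after: %zu\n",
                    sizeof(params_before), sizeof(params_after));
        return 0;
    }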
@@ -3716,14 +3716,14 @@ struct llama_model_params llama_model_default_params() {
         /*.n_gpu_layers                =*/ 0,
         /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
-        /*.tensor_split                =*/ nullptr,
-        /*.progress_callback           =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
-        /*.kv_overrides                =*/ nullptr,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
         /*.check_tensors               =*/ false,
+        /*.tensor_split                =*/ nullptr,
+        /*.progress_callback           =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
     };

 #ifdef GGML_USE_METAL
@@ -17,6 +17,12 @@
 #include <set>
 #include <unordered_map>

+// disable C++11 deprecation warning non-constant-expression cannot be narrowed
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wc++11-narrowing"
+#endif
+
 //
 // helpers
 //
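This is the "ignore error-warn for Clang 19" part of the title: Clang treats C++11 narrowing inside braced initialization as an error-level diagnostic controlled by `-Wc++11-narrowing`. The hunk shows only the `push` and `ignored`; a matching `pop` presumably closes the region later in the file, outside this excerpt. The general shape of the guard (a sketch, not the surrounding llama-vocab.cpp code; build with clang++, since other compilers diagnose narrowing through their own switches):

    // Sketch: suppressing Clang's narrowing diagnostic around brace-init code.
    #if defined(__clang__)
    #    pragma clang diagnostic push
    #    pragma clang diagnostic ignored "-Wc++11-narrowing"
    #endif

    int truncate_to_int(long v) {
        return int{ v }; // list-init narrowing: an error under Clang without the pragma
    }

    #if defined(__clang__)
    #    pragma clang diagnostic pop
    #endif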
@@ -803,9 +809,9 @@ struct llm_tokenizer_ugm_session {
         }

         // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, vocab.token_unk(), -FLT_MAX});
         // at the beginning tokenization score is zero
-        tokenization_results[0] = { vocab.token_unk(), 0, 0 };
+        tokenization_results[0] = { 0, vocab.token_unk(), 0 };

         for (size_t input_offset = 0; input_offset < input_len;) {
             size_t prefix_offset = input_offset;
@@ -835,7 +841,7 @@ struct llm_tokenizer_ugm_session {
                 const double challenger_score = current_best.score_sum + token_score;
                 struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                 if (challenger_score > current_champ.score_sum) {
-                    struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+                    struct best_tokenization challenger = { input_offset, token_id, (float) challenger_score };
                     current_champ = challenger;
                 }
             }
@@ -849,7 +855,7 @@ struct llm_tokenizer_ugm_session {
             prefix_offset = input_offset + n_utf8_code_units;
             struct best_tokenization & current_champ = tokenization_results[prefix_offset];
             if (challenger_score > current_champ.score_sum) {
-                struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
+                struct best_tokenization challenger = { input_offset, vocab.token_unk(), (float) challenger_score };
                 current_champ = challenger;
             }
         }
@@ -973,8 +979,8 @@ private:

     // this structure stores the best tokenization so far at input_offset
     struct best_tokenization {
-        llama_token token_id;
         size_t input_offset;
+        llama_token token_id;
         float score_sum;
     };
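All four tokenizer hunks exist only because `best_tokenization` gained a new field order (the 8-byte `size_t` first, again for alignment), so every positional brace-initializer had to be transposed to `{ input_offset, token_id, score }`. For orientation, the update these call sites implement, a Viterbi-style dynamic program over byte offsets, looks roughly like this (a simplified sketch, not the actual llama-vocab.cpp code; the types and the `challenge` helper are invented):

    // Sketch: the champion/challenger update around tokenization_results.
    // results[i] remembers the best-scoring way to tokenize the first i
    // bytes; a candidate token spanning [input_offset, end_offset) challenges
    // the champion stored at its end offset.
    #include <cstddef>
    #include <vector>

    using llama_token = int;      // stand-in for the real typedef

    struct best_tokenization {    // new field order from the commit
        size_t      input_offset; // where the winning token starts
        llama_token token_id;     // the winning token
        float       score_sum;    // best total score up to this offset
    };

    void challenge(std::vector<best_tokenization> & results,
                   size_t input_offset, size_t end_offset,
                   llama_token token_id, double challenger_score) {
        best_tokenization & current_champ = results[end_offset];
        if (challenger_score > current_champ.score_sum) {
            current_champ = { input_offset, token_id, (float) challenger_score };
        }
    }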