mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-02-06 16:40:34 +01:00
backend : group nodes in a single compute when user don't need them
This commit is contained in:
parent
65648b341f
commit
01b6f68a00
@ -8,11 +8,16 @@
|
|||||||
|
|
||||||
// a function that can be called for every computed node during graph evaluation
|
// a function that can be called for every computed node during graph evaluation
|
||||||
// the user can choose to whether to observe the data of the node depending on the tensor parameters
|
// the user can choose to whether to observe the data of the node depending on the tensor parameters
|
||||||
static bool observe_compute(int node_index, struct ggml_tensor * t, void * user_data) {
|
static bool observe_compute(int node_index, struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
GGML_UNUSED(user_data);
|
GGML_UNUSED(user_data);
|
||||||
|
|
||||||
|
// the scheduler is asking us if we want to observe this node
|
||||||
|
if (ask) {
|
||||||
// check if name contains soft_max
|
// check if name contains soft_max
|
||||||
if (strstr(t->name, "soft_max") != 0) {
|
return strstr(t->name, "soft_max") != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// print the node data
|
||||||
printf("%s: node_index = %5d, t->name = %32s, t->op = %12s, [%5d, %5d, %5d, %5d]\n",
|
printf("%s: node_index = %5d, t->name = %32s, t->op = %12s, [%5d, %5d, %5d, %5d]\n",
|
||||||
__func__, node_index, t->name, ggml_op_name(t->op), (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
|
__func__, node_index, t->name, ggml_op_name(t->op), (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
|
||||||
|
|
||||||
@ -24,7 +29,6 @@ static bool observe_compute(int node_index, struct ggml_tensor * t, void * user_
|
|||||||
printf("%8.4f ", t_data[i]);
|
printf("%8.4f ", t_data[i]);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -1337,18 +1337,25 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
|||||||
for (int j = 0; j < split->graph.n_nodes; j++) {
|
for (int j = 0; j < split->graph.n_nodes; j++) {
|
||||||
struct ggml_tensor * t = split->graph.nodes[j];
|
struct ggml_tensor * t = split->graph.nodes[j];
|
||||||
|
|
||||||
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j, j + 1);
|
int k = j;
|
||||||
|
|
||||||
|
// check if the user needs data from this node
|
||||||
|
while (!sched->callback_eval(k, t, true, sched->callback_eval_user_data) && k < split->graph.n_nodes - 1) {
|
||||||
|
t = split->graph.nodes[++k];
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j, k + 1);
|
||||||
|
|
||||||
ggml_backend_graph_compute(split_backend, &gv);
|
ggml_backend_graph_compute(split_backend, &gv);
|
||||||
|
|
||||||
if (ggml_is_view_op(t->op)) {
|
// TODO: k is node index in the split, not in the original graph
|
||||||
continue;
|
// TODO: avoid the ask == true call here
|
||||||
}
|
if (sched->callback_eval(k, t, true, sched->callback_eval_user_data) &&
|
||||||
|
!sched->callback_eval(k, t, false, sched->callback_eval_user_data)) {
|
||||||
// TODO: j is node index in the split, not in the original graph
|
|
||||||
if (!sched->callback_eval(j, t, sched->callback_eval_user_data)) {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
j = k;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
uint64_t compute_end_us = ggml_time_us();
|
uint64_t compute_end_us = ggml_time_us();
|
||||||
|
@ -148,8 +148,14 @@ extern "C" {
|
|||||||
struct ggml_backend_sched;
|
struct ggml_backend_sched;
|
||||||
typedef struct ggml_backend_sched * ggml_backend_sched_t;
|
typedef struct ggml_backend_sched * ggml_backend_sched_t;
|
||||||
|
|
||||||
|
// when ask == true, the scheduler wants to know if the user wants to observe this node
|
||||||
|
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
|
||||||
|
//
|
||||||
|
// when ask == false, the scheduler is passing the node tensor to the user for observation
|
||||||
|
// if the user returns false, the scheduler will cancel the graph compute
|
||||||
|
//
|
||||||
// TODO: propose to rename to ggml_backend_sched_callback_eval
|
// TODO: propose to rename to ggml_backend_sched_callback_eval
|
||||||
typedef bool (*ggml_backend_sched_eval_callback)(int node_index, struct ggml_tensor * t, void * user_data);
|
typedef bool (*ggml_backend_sched_eval_callback)(int node_index, struct ggml_tensor * t, bool ask, void * user_data);
|
||||||
|
|
||||||
// Initialize a backend scheduler
|
// Initialize a backend scheduler
|
||||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
|
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
|
||||||
|
Loading…
Reference in New Issue
Block a user