mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 13:58:46 +01:00
With mechanism to fall back if graph capture fails
This commit is contained in:
parent
d44e0fb22c
commit
eb9f15fb6f
33
ggml-cuda.cu
33
ggml-cuda.cu
@ -48,11 +48,20 @@
|
|||||||
|
|
||||||
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||||
|
|
||||||
[[noreturn]]
|
static bool disable_cuda_graphs_due_to_failed_capture = false;
|
||||||
|
|
||||||
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
|
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
|
||||||
int id = -1; // in case cudaGetDevice fails
|
int id = -1; // in case cudaGetDevice fails
|
||||||
cudaGetDevice(&id);
|
cudaGetDevice(&id);
|
||||||
|
|
||||||
|
if(strcmp(msg,"operation not permitted when stream is capturing")==0 ||
|
||||||
|
strcmp(msg,"operation failed due to a previous error during capture")==0) {
|
||||||
|
// CUDA graph capture has failed, but we can fall back to regular stream-based CUDA
|
||||||
|
// so mark as failed, clear the error and return.
|
||||||
|
disable_cuda_graphs_due_to_failed_capture = true;
|
||||||
|
cudaGetLastError();
|
||||||
|
return;
|
||||||
|
}
|
||||||
fprintf(stderr, "CUDA error: %s\n", msg);
|
fprintf(stderr, "CUDA error: %s\n", msg);
|
||||||
fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line);
|
fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line);
|
||||||
fprintf(stderr, " %s\n", stmt);
|
fprintf(stderr, " %s\n", stmt);
|
||||||
@ -2428,6 +2437,7 @@ struct ggml_cuda_graph {
|
|||||||
cudaKernelNodeParams params[MAX_NODES_IN_CUDA_GRAPH];
|
cudaKernelNodeParams params[MAX_NODES_IN_CUDA_GRAPH];
|
||||||
bool disable_due_to_gpu_arch = false;
|
bool disable_due_to_gpu_arch = false;
|
||||||
bool disable_due_to_too_many_updates = false;
|
bool disable_due_to_too_many_updates = false;
|
||||||
|
bool disable_due_to_failed_graph_capture = false;
|
||||||
int number_consecutive_updates = 0;
|
int number_consecutive_updates = 0;
|
||||||
ggml_graph_node_properties ggml_graph_properties[MAX_NODES_IN_CUDA_GRAPH];
|
ggml_graph_node_properties ggml_graph_properties[MAX_NODES_IN_CUDA_GRAPH];
|
||||||
};
|
};
|
||||||
@ -2481,9 +2491,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Disable CUDA graphs in presence of env var, old GPU or use-case which is changing too rapidly.
|
// Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
|
||||||
|
// or previous graph capture failure.
|
||||||
// Also disable for multi-gpu for now. TO DO investigate
|
// Also disable for multi-gpu for now. TO DO investigate
|
||||||
if(disable_cuda_graphs || cuda_graph.disable_due_to_gpu_arch || cuda_graph.disable_due_to_too_many_updates ||
|
if(disable_cuda_graphs || cuda_graph.disable_due_to_gpu_arch ||
|
||||||
|
cuda_graph.disable_due_to_too_many_updates || cuda_graph.disable_due_to_failed_graph_capture ||
|
||||||
ggml_backend_cuda_get_device_count() > 1){
|
ggml_backend_cuda_get_device_count() > 1){
|
||||||
use_cuda_graph = false;
|
use_cuda_graph = false;
|
||||||
}
|
}
|
||||||
@ -2541,10 +2553,15 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||||||
bool cuda_graph_update_required = false;
|
bool cuda_graph_update_required = false;
|
||||||
#endif // USE_CUDA_GRAPH
|
#endif // USE_CUDA_GRAPH
|
||||||
|
|
||||||
|
bool graph_evaluated_or_captured = false;
|
||||||
|
|
||||||
|
while(!graph_evaluated_or_captured) {
|
||||||
|
// Temporarily avoid indenting here (and below the following if) to make code review easier
|
||||||
|
|
||||||
// Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
|
// Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
|
||||||
// With the use of CUDA graphs, the execution will be performed by the graph launch.
|
// With the use of CUDA graphs, the execution will be performed by the graph launch.
|
||||||
if(!use_cuda_graph || cuda_graph_update_required) {
|
if(!use_cuda_graph || cuda_graph_update_required) {
|
||||||
//temporarily avoid indenting here to make code review easier
|
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
ggml_tensor * node = cgraph->nodes[i];
|
ggml_tensor * node = cgraph->nodes[i];
|
||||||
|
|
||||||
@ -2572,6 +2589,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||||||
#ifdef USE_CUDA_GRAPH
|
#ifdef USE_CUDA_GRAPH
|
||||||
if(use_cuda_graph && (cuda_graph_update_required)) { // End CUDA graph capture
|
if(use_cuda_graph && (cuda_graph_update_required)) { // End CUDA graph capture
|
||||||
CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_graph.graph));
|
CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_graph.graph));
|
||||||
|
if(disable_cuda_graphs_due_to_failed_capture) {
|
||||||
|
use_cuda_graph = false;
|
||||||
|
cuda_graph.disable_due_to_failed_graph_capture = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
graph_evaluated_or_captured = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if(use_cuda_graph){
|
if(use_cuda_graph){
|
||||||
|
|
||||||
|
@ -172,7 +172,6 @@
|
|||||||
|
|
||||||
#define GGML_CUDA_MAX_STREAMS 8
|
#define GGML_CUDA_MAX_STREAMS 8
|
||||||
|
|
||||||
[[noreturn]]
|
|
||||||
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
|
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
|
||||||
|
|
||||||
#define CUDA_CHECK_GEN(err, success, error_fn) \
|
#define CUDA_CHECK_GEN(err, success, error_fn) \
|
||||||
|
Loading…
Reference in New Issue
Block a user