From df4719ec7e3fa19617e671d75dcd8319b6777397 Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Tue, 23 Apr 2024 06:27:08 -0700
Subject: [PATCH] Disable CUDA graphs for old GPU arch and with env var

---
 ggml-cuda.cu | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 6f1973f2c..5b696719b 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2419,9 +2419,12 @@ struct ggml_cudaGraph {
     int softmax_ne0 = 0;
     cudaGraphNode_t nodes[MAX_NODES_IN_CUDA_GRAPH];
     cudaKernelNodeParams params[MAX_NODES_IN_CUDA_GRAPH];
+    // Set once on first evaluation; permanently disables graph capture on pre-Ampere GPUs.
+    bool disableDueToGpuArch = false;
 };
 #endif
 
+// One-time, process-wide opt-out: set LLAMACPP_DISABLE_CUDA_GRAPHS to any value to disable graphs.
+const bool disableCudaGraphs = (getenv("LLAMACPP_DISABLE_CUDA_GRAPHS") != nullptr);
+
 GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
@@ -2437,8 +2440,21 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     // kernel parameters which need updated in the graph for each token
     void* ggmlCudaCpyFn = nullptr;
 
-    if(ggml_backend_cuda_get_device_count() > 1){
-        useCudaGraph = false; // disable CUDA graphs for multi-gpu for now. TO DO investigate
+
+    if (cudaGraph.count == 0) {
+        // First graph evaluation: check compute capability once (requires SM80+).
+        cudaDeviceProp prop;
+        int device;
+        CUDA_CHECK(cudaGetDevice(&device));
+        CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+        cudaGraph.disableDueToGpuArch = (prop.major < 8);
+    }
+
+    // Disable CUDA graphs in presence of env var or old GPU.
+    // Also disable for multi-gpu for now. TODO: investigate
+    if (disableCudaGraphs || cudaGraph.disableDueToGpuArch || ggml_backend_cuda_get_device_count() > 1) {
+        useCudaGraph = false;
     }
 
     if(useCudaGraph) {