From df4719ec7e3fa19617e671d75dcd8319b6777397 Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Tue, 23 Apr 2024 06:27:08 -0700
Subject: [PATCH] Disable CUDA graphs for old GPU arch and with env var

---
 ggml-cuda.cu | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 6f1973f2c..5b696719b 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2419,9 +2419,12 @@ struct ggml_cudaGraph {
     int softmax_ne0 = 0;
     cudaGraphNode_t nodes[MAX_NODES_IN_CUDA_GRAPH];
     cudaKernelNodeParams params[MAX_NODES_IN_CUDA_GRAPH];
+    // Set once on first evaluation; permanently disables graph capture on pre-Ampere GPUs.
+    bool disableDueToGpuArch = false;
 };
 #endif
 
+// One-time, process-wide opt-out: set LLAMACPP_DISABLE_CUDA_GRAPHS to any value to disable graphs.
+const bool disableCudaGraphs = (getenv("LLAMACPP_DISABLE_CUDA_GRAPHS") != nullptr);
+
 GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
@@ -2437,8 +2440,21 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     // kernel parameters which need updated in the graph for each token
     void* ggmlCudaCpyFn = nullptr;
 
-    if(ggml_backend_cuda_get_device_count() > 1){
-        useCudaGraph = false; // disable CUDA graphs for multi-gpu for now. TO DO investigate
+
+    if (cudaGraph.count == 0) {
+        // First graph evaluation: check compute capability once (requires SM80+).
+        cudaDeviceProp prop;
+        int device;
+        CUDA_CHECK(cudaGetDevice(&device));
+        CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+        cudaGraph.disableDueToGpuArch = (prop.major < 8);
+    }
+
+    // Disable CUDA graphs in presence of env var or old GPU.
+    // Also disable for multi-gpu for now. TODO: investigate
+    if (disableCudaGraphs || cudaGraph.disableDueToGpuArch || ggml_backend_cuda_get_device_count() > 1) {
+        useCudaGraph = false;
     }
 
     if(useCudaGraph) {