From c2691d968ac5f5ea79e8e1fed015ac8cf279a732 Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Mon, 22 Apr 2024 09:01:44 -0700
Subject: [PATCH] disable for multi-gpu and batch size > 1

---
 ggml-cuda.cu | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 7da061240..6f1973f2c 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2436,6 +2436,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     // pointer to CUDA cpy kernel, which is required to identify
     // kernel parameters which need updated in the graph for each token
     void* ggmlCudaCpyFn = nullptr;
+
+    if(ggml_backend_cuda_get_device_count() > 1){
+        useCudaGraph = false; // disable CUDA graphs for multi-gpu for now. TO DO investigate
+    }
+
     if(useCudaGraph) {

         if(cudaGraph.instance == nullptr) cudaGraphUpdateRequired=true;
@@ -2447,6 +2452,9 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         // Identify if the graph needs updated for this token due to the number of elements changing
         // (identified by inspecting soft max op parameters)
         if(node->op == GGML_OP_SOFT_MAX) {
+            if(node->src[1]->ne[1] > 1){
+                useCudaGraph = false; // disable CUDA graphs for batch size > 1 for now. TO DO investigate
+            }
            if(node->src[0]->ne[0] != cudaGraph.softmax_ne0) {
                cudaGraphUpdateRequired = true;
                cudaGraph.softmax_ne0 = node->src[0]->ne[0];
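
As a rough companion to the hunks above, here is a minimal, self-contained sketch of the combined guard logic. The names tensor, OP_SOFT_MAX, and cuda_graph_usable are hypothetical stand-ins, not the real ggml types; in the actual code the checks sit inside ggml_backend_cuda_graph_compute, and a soft-max node's mask (src[1]) having ne[1] > 1 is used as a proxy for batch size > 1.

#include <cassert>
#include <cstdint>

// Hypothetical stand-ins for the real ggml structures (illustration only).
struct tensor {
    int64_t ne[4];   // dimension sizes, as in ggml_tensor::ne
    tensor* src[2];  // source tensors; src[1] of a soft-max op is the mask
    int     op;      // op id; OP_SOFT_MAX in this sketch
};

constexpr int OP_SOFT_MAX = 1;

// Mirrors the patch's rules: disable CUDA graphs when more than one GPU is
// in use, or when a soft-max node's mask has more than one row (batch > 1).
bool cuda_graph_usable(int device_count, const tensor* node) {
    if (device_count > 1) {
        return false; // multi-GPU: graph capture across devices not yet handled
    }
    if (node->op == OP_SOFT_MAX && node->src[1]->ne[1] > 1) {
        return false; // batch size > 1: per-token graph update not yet handled
    }
    return true;
}

int main() {
    tensor mask    = {{512, 1, 1, 1}, {nullptr, nullptr}, 0};
    tensor softmax = {{512, 32, 1, 1}, {nullptr, &mask}, OP_SOFT_MAX};
    assert(cuda_graph_usable(1, &softmax));   // single GPU, single token: graphs OK
    assert(!cuda_graph_usable(2, &softmax));  // multi-GPU: graphs disabled
    mask.ne[1] = 8;                           // mask now has 8 rows
    assert(!cuda_graph_usable(1, &softmax));  // batch size > 1: graphs disabled
    return 0;
}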