diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index d277104d1..07a3bc32e 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2405,11 +2405,63 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     GGML_UNUSED(backend);
 }
 
+struct ggml_cudaGraph {
+    int count=0;
+    cudaGraph_t graph = nullptr;
+    cudaGraphExec_t instance = nullptr;
+    size_t numNodes = 0;
+    int softmax_ne0 = 0;
+};
+
 GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     ggml_cuda_set_device(cuda_ctx->device);
 
+    // Objects required for CUDA Graph
+#define MAX_NODES_IN_CUDA_GRAPH 10000
+    static ggml_cudaGraph cudaGraph; // TODO: move this to a suitable persistent location (and avoid use of static memory)
+    bool useCudaGraph = (cudaGraph.count>=2); // avoid CUDA graphs on the first 2 steps due to incompatible initialisations.
+    char** updatedKernelArg[MAX_NODES_IN_CUDA_GRAPH];
+    bool cudaGraphUpdateRequired = false;
+    // pointer to the CUDA cpy kernel, which is required to identify
+    // the kernel parameters which need to be updated in the graph for each token
+    void* ggmlCudaCpyFn = nullptr;
+    if(useCudaGraph) {
+
+        if(cudaGraph.instance == nullptr) cudaGraphUpdateRequired=true;
+
+        // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+        int k=0;
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            ggml_tensor * node = cgraph->nodes[i];
+            // Identify if the graph needs to be updated for this token due to the number of elements changing
+            // (identified by inspecting soft max op parameters)
+            if(node->op == GGML_OP_SOFT_MAX) {
+                if(node->src[0]->ne[0] != cudaGraph.softmax_ne0) {
+                    cudaGraphUpdateRequired = true;
+                    cudaGraph.softmax_ne0 = node->src[0]->ne[0];
+                }
+            }
+            if(node->op == GGML_OP_CPY) {
+                // store the copy op parameter which changes with each token.
+                updatedKernelArg[k++]=(char**) &(node->src[1]->data);
+                if(ggmlCudaCpyFn == nullptr){
+                    // store a pointer to the copy op CUDA kernel to identify it later
+                    ggmlCudaCpyFn = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+                }
+            }
+        }
+    }
+
+    if(useCudaGraph && cudaGraphUpdateRequired) { // Start CUDA graph capture
+        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeGlobal));
+    }
+
+    // Only perform the graph execution here if CUDA graphs are not enabled, or we are capturing the graph.
+    // With use of CUDA graphs, the execution will instead be performed by the graph launch.
+    if(!useCudaGraph || cudaGraphUpdateRequired) {
+    // temporarily avoid indenting here to make code review easier
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
@@ -2432,7 +2484,67 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         }
 
         GGML_ASSERT(ok);
     }
+    }
 
+    if(useCudaGraph && (cudaGraphUpdateRequired)) { // End CUDA graph capture
+        CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cudaGraph.graph));
+    }
+    if(useCudaGraph){
+
+        if(cudaGraph.instance == nullptr) { // Create executable graph from captured graph.
+            CUDA_CHECK(cudaGraphInstantiate(&cudaGraph.instance, cudaGraph.graph, NULL, NULL, 0));
+        }
+
+
+        // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+
+        cudaGraphNode_t nodes[MAX_NODES_IN_CUDA_GRAPH];
+        CUDA_KERNEL_NODE_PARAMS_v2 paramsDriver[MAX_NODES_IN_CUDA_GRAPH];
+        cudaKernelNodeParams paramsRuntime[MAX_NODES_IN_CUDA_GRAPH];
+
+        if(cudaGraphUpdateRequired) {
+            // Extract nodes from graph
+            if(cudaGraph.numNodes == 0) {
+                CUDA_CHECK(cudaGraphGetNodes(cudaGraph.graph, nullptr, &cudaGraph.numNodes));
+            }
+            CUDA_CHECK(cudaGraphGetNodes(cudaGraph.graph, nodes, &cudaGraph.numNodes));
+
+            // Loop over nodes, and extract kernel parameters for each node
+            for(size_t i=0; i<cudaGraph.numNodes; i++) {
+                [...]   // per-node kernel-parameter extraction and per-token copy-kernel argument update (not recovered from source text)
+
+        CUDA_CHECK(cudaGraphLaunch(cudaGraph.instance, cuda_ctx->stream()));
+    }
+    cudaGraph.count++;
     return GGML_STATUS_SUCCESS;
 }
diff --git a/ggml-cuda/cpy.cu b/ggml-cuda/cpy.cu
index 16d9c8fff..12d741f01 100644
--- a/ggml-cuda/cpy.cu
+++ b/ggml-cuda/cpy.cu
@@ -459,3 +459,32 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     ggml_cuda_cpy(ctx, src0, dst);
 }
+
+void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_f32_f16<cpy_1_f32_f32>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
+        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
+        return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        return (void*) cpy_f32_f16<cpy_1_f16_f16>;
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_f32_f16<cpy_1_f16_f32>;
+    } else {
+        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+                ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ASSERT(false);
+    }
+}
+
diff --git a/ggml-cuda/cpy.cuh b/ggml-cuda/cpy.cuh
index f0b2c453b..796167426 100644
--- a/ggml-cuda/cpy.cuh
+++ b/ggml-cuda/cpy.cuh
@@ -5,3 +5,5 @@
 void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);
 
 void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
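
For reference, below is a minimal standalone sketch (not part of the patch) of the graph-update pattern the patch relies on: instantiate a CUDA graph once, then patch a kernel argument in the executable graph with cudaGraphExecKernelNodeSetParams() and relaunch it, instead of rebuilding the graph on every step. The patch obtains its nodes from a stream capture via cudaGraphGetNodes(); for brevity this sketch builds a one-node graph explicitly so that the kernel-argument storage is owned by the host code. The scale kernel, CUDA_CHECK macro and surrounding harness are illustrative only.

// sketch: update a kernel argument of an instantiated CUDA graph and relaunch it
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                      \
                    cudaGetErrorString(err_), __FILE__, __LINE__);           \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

__global__ void scale(float * x, float factor, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= factor;
    }
}

int main() {
    const int n = 1 << 20;
    float * d_x = nullptr;
    CUDA_CHECK(cudaMalloc(&d_x, n * sizeof(float)));
    CUDA_CHECK(cudaMemset(d_x, 0, n * sizeof(float)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Host-side storage for the kernel arguments; kernelParams holds pointers to these.
    float factor = 1.0f;
    int   n_arg  = n;
    void * kernel_args[] = { &d_x, &factor, &n_arg };

    cudaKernelNodeParams params = {};
    params.func           = (void *) scale;          // host stub address identifies the kernel
    params.gridDim        = dim3((n + 255) / 256);
    params.blockDim       = dim3(256);
    params.sharedMemBytes = 0;
    params.kernelParams   = kernel_args;
    params.extra          = nullptr;

    // Build a one-node graph and create the executable instance once.
    cudaGraph_t     graph;
    cudaGraphExec_t instance;
    cudaGraphNode_t kernel_node;
    CUDA_CHECK(cudaGraphCreate(&graph, 0));
    CUDA_CHECK(cudaGraphAddKernelNode(&kernel_node, graph, nullptr, 0, &params));
    CUDA_CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));

    for (int step = 0; step < 3; ++step) {
        // Change an argument and push it into the instantiated graph (cheap),
        // analogous to updating the copy-kernel src1 pointer for each token.
        factor = 1.0f + step;
        CUDA_CHECK(cudaGraphExecKernelNodeSetParams(instance, kernel_node, &params));

        // Replay the whole step with a single graph launch.
        CUDA_CHECK(cudaGraphLaunch(instance, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }

    CUDA_CHECK(cudaGraphExecDestroy(instance));
    CUDA_CHECK(cudaGraphDestroy(graph));
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(d_x));
    return 0;
}

In the patch the same idea is applied to the stream-captured graph: the kernel node whose func matches the pointer returned by ggml_cuda_cpy_fn() is located, and its destination-pointer argument is replaced before each launch.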