Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-30 13:53:03 +01:00)
Disable CUDA graphs for multi-GPU and for batch size > 1
commit c2691d968a
parent 800f4fe48e
@@ -2436,6 +2436,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     // pointer to CUDA cpy kernel, which is required to identify
     // kernel parameters which need updated in the graph for each token
     void* ggmlCudaCpyFn = nullptr;
+
+    if(ggml_backend_cuda_get_device_count() > 1){
+        useCudaGraph = false; // disable CUDA graphs for multi-gpu for now. TO DO investigate
+    }
+
     if(useCudaGraph) {
 
         if(cudaGraph.instance == nullptr) cudaGraphUpdateRequired=true;
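The first hunk turns CUDA graphs off whenever more than one CUDA device is visible. As a rough illustration, here is a minimal sketch that probes the same condition through the public ggml_backend_cuda_get_device_count() helper; the standalone wrapper program is an assumption for illustration and is not part of the commit.

```cpp
// Sketch only: probes the same condition as the new multi-GPU guard.
// Assumes ggml-cuda.h from this repository; the wrapper program is illustrative.
#include "ggml-cuda.h"
#include <cstdio>

int main() {
    const int n_devices = ggml_backend_cuda_get_device_count();

    // Mirrors the commit's policy: with more than one device visible,
    // CUDA graph capture is skipped entirely for now.
    const bool use_cuda_graph = (n_devices <= 1);

    printf("CUDA devices: %d -> CUDA graphs %s\n",
           n_devices, use_cuda_graph ? "allowed" : "disabled");
    return 0;
}
```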
@@ -2447,6 +2452,9 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             // Identify if the graph needs updated for this token due to the number of elements changing
             // (identified by inspecting soft max op parameters)
             if(node->op == GGML_OP_SOFT_MAX) {
+                if(node->src[1]->ne[1] > 1){
+                    useCudaGraph = false; // disable CUDA graphs for batch size > 1 for now. TO DO investigate
+                }
                 if(node->src[0]->ne[0] != cudaGraph.softmax_ne0) {
                     cudaGraphUpdateRequired = true;
                     cudaGraph.softmax_ne0 = node->src[0]->ne[0];
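The second hunk infers the batch size from the soft-max node: src[1] is the mask tensor, and its second dimension grows with the number of tokens processed per launch, so a value greater than 1 indicates a multi-token batch. Below is a minimal sketch of that heuristic pulled out into a standalone helper, assuming the ggml_cgraph fields (n_nodes, nodes) that ggml_backend_cuda_graph_compute already iterates over and that the ggml_cgraph definition is visible via ggml.h, as it was at the time of this commit; the helper name is hypothetical and not part of the commit.

```cpp
// Sketch only: the batch-size heuristic from the second hunk, pulled out into
// a helper. Assumes ggml.h from this repository; the helper name is hypothetical.
#include "ggml.h"

static bool graph_has_multi_token_batch(const struct ggml_cgraph * cgraph) {
    for (int i = 0; i < cgraph->n_nodes; ++i) {
        const struct ggml_tensor * node = cgraph->nodes[i];
        // For GGML_OP_SOFT_MAX, src[1] is the mask; its second dimension
        // tracks the number of tokens being processed in the current batch.
        if (node->op == GGML_OP_SOFT_MAX && node->src[1] != nullptr && node->src[1]->ne[1] > 1) {
            return true; // batch size > 1 -> the commit keeps CUDA graphs disabled
        }
    }
    return false;
}
```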