Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-24 13:28:50 +01:00)
commit 132d25b8a6 (parent 3d48f42efc)
cuda : fix disabling device with --tensor-split 1,0 (#3951)
Co-authored-by: slaren <slarengh@gmail.com>
1 changed file: ggml-cuda.cu (16 changed lines: 13 additions, 3 deletions)
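For context on the failure mode this commit addresses: --tensor-split controls what fraction of a split tensor's rows each GPU receives, and a ratio of 1,0 assigns every row to device 0, leaving device 1 with an empty range (row_low == row_high). Below is a minimal standalone sketch of that mapping; it is not the actual llama.cpp splitting code (which differs in details such as rounding row boundaries), only an illustration of why a zero share yields an empty, skippable row range.

#include <stdio.h>

#define MAX_DEVICES 16

/* Sketch: map split ratios to per-device row ranges [row_low, row_high).
 * A device whose ratio is 0 receives an empty range. */
static void split_rows(const float * tensor_split, int device_count,
                       long nrows, long * row_low, long * row_high) {
    float total = 0.0f;
    for (int id = 0; id < device_count; ++id) {
        total += tensor_split[id];
    }
    float acc = 0.0f;
    for (int id = 0; id < device_count; ++id) {
        row_low[id]  = (long)(nrows * (acc / total));
        acc += tensor_split[id];
        row_high[id] = (long)(nrows * (acc / total));
    }
}

int main(void) {
    const float split[2] = {1.0f, 0.0f}; /* --tensor-split 1,0 */
    long low[MAX_DEVICES];
    long high[MAX_DEVICES];
    split_rows(split, 2, 4096, low, high);
    for (int id = 0; id < 2; ++id) {
        printf("device %d: rows [%ld, %ld)%s\n", id, low[id], high[id],
               low[id] == high[id] ? "  <- empty, device unused" : "");
    }
    return 0;
}

This prints rows [0, 4096) for device 0 and the empty range [4096, 4096) for device 1, which is exactly the case the guards in the hunks below detect.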
@@ -6893,6 +6893,8 @@ static void ggml_cuda_op_mul_mat(
     int64_t row_low[GGML_CUDA_MAX_DEVICES];
     int64_t row_high[GGML_CUDA_MAX_DEVICES];
 
+    int used_devices = 0;
+
     for (int64_t id = 0; id < g_device_count; ++id) {
         // by default, use all rows
         row_low[id] = 0;
@@ -6920,6 +6922,8 @@ static void ggml_cuda_op_mul_mat(
             continue;
         }
 
+        used_devices++;
+
         const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
         const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 
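Note that the continue and closing brace at the top of this hunk belong to the loop's existing skip check (outside the hunk's context window), so used_devices++ runs only for devices that pass it; by the end of the loop the counter holds the number of devices that actually participate in the multiplication.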
@@ -6958,12 +6962,12 @@ static void ggml_cuda_op_mul_mat(
 
     // if multiple devices are used they need to wait for the main device
     // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split && g_device_count > 1) {
+    if (split && used_devices > 1) {
        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
     }
 
-    const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
     for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
         const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
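This is the heart of the fix: g_device_count counts visible GPUs, while the new used_devices counts GPUs that were actually assigned rows, and only the latter determines whether cross-device synchronization and column chunking are needed. A hypothetical worked example (the value of MUL_MAT_SRC1_COL_STRIDE is an assumption here):

/* 2 visible GPUs, --tensor-split 1,0, ne11 = 512 src1 columns:
 *   old: split && g_device_count > 1 -> true  -> src1_col_stride = MUL_MAT_SRC1_COL_STRIDE
 *        (assumed 128) -> src1 processed in 4 chunks, plus an event recorded
 *        for a second device that never consumes it
 *   new: split && used_devices > 1   -> false -> src1_col_stride = ne11 = 512
 *        -> one pass over src1, no cross-device event needed
 */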
@@ -7079,6 +7083,9 @@ static void ggml_cuda_op_mul_mat(
     }
 
     for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            continue;
+        }
         CUDA_CHECK(ggml_cuda_set_device(id));
 
         // free buffers again when done
@@ -7103,6 +7110,9 @@ static void ggml_cuda_op_mul_mat(
 
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         for (int64_t id = 0; id < g_device_count; ++id) {
+            if (row_low[id] == row_high[id]) {
+                continue;
+            }
             for (int64_t is = 0; is < is_max; ++is) {
                 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
             }
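This hunk and the previous one add the same guard to the two per-device loops that run after the compute loop: the cleanup pass and the final wait on per-device events. A device with an empty row range did no work, recorded no events, and may have had nothing allocated for it, so both loops now skip it entirely instead of making it the current device and touching its state.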
@@ -7400,7 +7410,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU) &&
+        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         (src1->backend == GGML_BACKEND_GPU) &&
         ( dst->backend == GGML_BACKEND_GPU);
 
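The last hunk relaxes the all_on_device check in ggml_cuda_mul_mat: src0 tensors held in the split backend (GGML_BACKEND_GPU_SPLIT) now also count as on-device. Presumably this keeps a model loaded with --tensor-split 1,0, whose weights are split-backend tensors even though a single GPU holds all the rows, from being routed away from the fully-on-GPU dispatch paths; any path that genuinely cannot handle a split layout would still have to check for it separately (not shown in this hunk).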