llama : remove MPI backend (#7395)

commit d359f30921 (parent 1ea2a0036e)
.devops/nix/package.nix

@@ -214,7 +214,6 @@ effectiveStdenv.mkDerivation (
       (cmakeBool "LLAMA_CUDA" useCuda)
       (cmakeBool "LLAMA_HIPBLAS" useRocm)
       (cmakeBool "LLAMA_METAL" useMetalKit)
-      (cmakeBool "LLAMA_MPI" useMpi)
       (cmakeBool "LLAMA_VULKAN" useVulkan)
       (cmakeBool "LLAMA_STATIC" enableStatic)
     ]
.github/workflows/build.yml (34 lines changed)
@@ -306,40 +306,6 @@ jobs:
           cd build
           ctest -L main --verbose --timeout 900

-  ubuntu-latest-cmake-mpi:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    strategy:
-      matrix:
-        mpi_library: [mpich, libopenmpi-dev]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ${{ matrix.mpi_library }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake -DLLAMA_MPI=ON ..
-          cmake --build . --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose
-
   ubuntu-latest-cmake-rpc:
     runs-on: ubuntu-latest

CMakeLists.txt

@@ -122,7 +122,6 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
     "llama: metal minimum macOS version")
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
-option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_RPC "llama: use RPC" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)

@@ -466,35 +465,6 @@ if (LLAMA_CUDA)
     endif()
 endif()

-if (LLAMA_MPI)
-    cmake_minimum_required(VERSION 3.10)
-    find_package(MPI)
-    if (MPI_C_FOUND)
-        message(STATUS "MPI found")
-
-        set(GGML_HEADERS_MPI ggml-mpi.h)
-        set(GGML_SOURCES_MPI ggml-mpi.c)
-
-        add_compile_definitions(GGML_USE_MPI)
-        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
-
-        if (NOT MSVC)
-            add_compile_options(-Wno-cast-qual)
-        endif()
-
-        set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${MPI_C_LIBRARIES})
-        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
-
-        # Even if you're only using the C header, C++ programs may bring in MPI
-        # C++ functions, so more linkage is needed
-        if (MPI_CXX_FOUND)
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
-        endif()
-    else()
-        message(WARNING "MPI not found")
-    endif()
-endif()
-
 if (LLAMA_RPC)
     add_compile_definitions(GGML_USE_RPC)

@@ -1218,7 +1188,6 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
             ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
             ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
             ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}

@@ -1306,7 +1275,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake

 set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
     "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
-    "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
+    "${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")

 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 install(TARGETS ggml PUBLIC_HEADER)
Makefile (12 lines changed)
@@ -399,13 +399,6 @@ ifndef LLAMA_NO_ACCELERATE
     endif
 endif # LLAMA_NO_ACCELERATE

-ifdef LLAMA_MPI
-    MK_CPPFLAGS += -DGGML_USE_MPI
-    MK_CFLAGS   += -Wno-cast-qual
-    MK_CXXFLAGS += -Wno-cast-qual
-    OBJS        += ggml-mpi.o
-endif # LLAMA_MPI
-
 ifdef LLAMA_OPENBLAS
     MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
     MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)

@@ -629,11 +622,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
     endif
 endif # LLAMA_METAL

-ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
-	$(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_MPI
-
 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
README.md (39 lines changed)
@@ -382,45 +382,6 @@ To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or th
 When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
 argument.

-### MPI Build
-
-MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
-
-First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
-
-Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
-
-- Using `make`:
-
-  ```bash
-  make CC=mpicc CXX=mpicxx LLAMA_MPI=1
-  ```
-
-- Using `CMake`:
-
-  ```bash
-  cmake -S . -B build -DLLAMA_MPI=ON
-  ```
-
-Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
-
-Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
-
-Here is an example hostfile:
-
-```
-192.168.0.1:2
-malvolio.local:1
-```
-
-The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
-
-Finally, you're ready to run a computation using `mpirun`:
-
-```bash
-mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
-```
-
 ### BLAS Build

 Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
ggml-mpi.c (deleted, 216 lines)
@@ -1,216 +0,0 @@
-#include "ggml-mpi.h"
-
-#include "ggml.h"
-
-#include <mpi.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
-#define UNUSED GGML_UNUSED
-
-struct ggml_mpi_context {
-    int rank;
-    int size;
-};
-
-void ggml_mpi_backend_init(void) {
-    MPI_Init(NULL, NULL);
-}
-
-void ggml_mpi_backend_free(void) {
-    MPI_Finalize();
-}
-
-struct ggml_mpi_context * ggml_mpi_init(void) {
-    struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context));
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
-
-    return ctx;
-}
-
-void ggml_mpi_free(struct ggml_mpi_context * ctx) {
-    free(ctx);
-}
-
-int ggml_mpi_rank(struct ggml_mpi_context * ctx) {
-    return ctx->rank;
-}
-
-void ggml_mpi_eval_init(
-        struct ggml_mpi_context * ctx_mpi,
-                            int * n_tokens,
-                            int * n_past,
-                            int * n_threads) {
-    UNUSED(ctx_mpi);
-
-    // synchronize the worker node parameters with the root node
-    MPI_Barrier(MPI_COMM_WORLD);
-
-    MPI_Bcast(n_tokens,  1, MPI_INT, 0, MPI_COMM_WORLD);
-    MPI_Bcast(n_past,    1, MPI_INT, 0, MPI_COMM_WORLD);
-    MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);
-}
-
-static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
-    struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
-    if (t == NULL) {
-        fprintf(stderr, "%s: tensor %s not found\n", __func__, name);
-        return -1;
-    }
-
-    for (int i = 0; i < gf->n_nodes; i++) {
-        if (gf->nodes[i] == t) {
-            return i;
-        }
-    }
-
-    fprintf(stderr, "%s: tensor %s not found in graph (should not happen)\n", __func__, name);
-    return -1;
-}
-
-static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
-    MPI_Datatype mpi_type;
-
-    switch (t->type) {
-        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
-        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
-        default: GGML_ASSERT(false && "not implemented");
-    }
-
-    const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD);
-    GGML_ASSERT(retval == MPI_SUCCESS);
-}
-
-static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {
-    MPI_Datatype mpi_type;
-
-    switch (t->type) {
-        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
-        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
-        default: GGML_ASSERT(false && "not implemented");
-    }
-
-    MPI_Status status; UNUSED(status);
-
-    const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
-    GGML_ASSERT(retval == MPI_SUCCESS);
-}
-
-// TODO: there are many improvements that can be done to this implementation
-void ggml_mpi_graph_compute_pre(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                              int n_layers) {
-    const int mpi_rank = ctx_mpi->rank;
-    const int mpi_size = ctx_mpi->size;
-
-    struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
-    if (inp_tokens == NULL) {
-        fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
-        return;
-    }
-
-    struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0");
-    if (inp0 == NULL) {
-        fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
-        return;
-    }
-
-    GGML_ASSERT(inp0 == gf->nodes[0]);
-
-    // distribute the compute graph into slices across the MPI nodes
-    //
-    // the main node (0) processes the last layers + the remainder of the compute graph
-    // and is responsible to pass the input tokens to the first node (1)
-    //
-    // node 1:   [(  0) * n_per_node, (  1) * n_per_node)
-    // node 2:   [(  1) * n_per_node, (  2) * n_per_node)
-    // ...
-    // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node)
-    // node 0:   [(n-1) * n_per_node,            n_nodes)
-    //
-    if (mpi_rank > 0) {
-        if (mpi_rank == 1) {
-            // the first node (1) receives the input tokens from the main node (0)
-            ggml_mpi_tensor_recv(inp_tokens, 0);
-        } else {
-            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
-            ggml_mpi_tensor_recv(inp0, mpi_rank - 1);
-        }
-    } else if (mpi_size > 1) {
-        // node 0 sends the input tokens to node 1
-        ggml_mpi_tensor_send(inp_tokens, 1);
-
-        // recv the output data from the last node
-        ggml_mpi_tensor_recv(inp0, mpi_size - 1);
-    }
-
-    {
-        const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
-
-        const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
-
-        const int il0 =               (mpi_idx + 0) * n_per_node;
-        const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
-
-        char name_l0[GGML_MAX_NAME];
-        char name_l1[GGML_MAX_NAME];
-
-        snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0);
-        snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
-
-        const int idx_l0 =                ggml_graph_get_node_idx(gf, name_l0);
-        const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
-
-        if (idx_l0 < 0 || idx_l1 < 0) {
-            fprintf(stderr, "%s: layer input nodes not found\n", __func__);
-            return;
-        }
-
-        // attach the input data to all nodes that need it
-        // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
-        for (int i = idx_l0; i < idx_l1; i++) {
-            if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
-                gf->nodes[i]->src[0] = inp0;
-            }
-            if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
-                gf->nodes[i]->src[1] = inp0;
-            }
-        }
-
-        // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph
-        for (int i = 1; i < idx_l1 - idx_l0; i++) {
-            gf->nodes[i] = gf->nodes[idx_l0 + i];
-            gf->grads[i] = gf->grads[idx_l0 + i];
-        }
-
-        // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node
-        if (mpi_idx != 0) {
-            gf->nodes[0]->op = GGML_OP_NONE;
-        }
-
-        gf->n_nodes = idx_l1 - idx_l0;
-
-        //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1);
-    }
-}
-
-void ggml_mpi_graph_compute_post(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                              int n_layers) {
-    UNUSED(n_layers);
-
-    const int mpi_rank = ctx_mpi->rank;
-    const int mpi_size = ctx_mpi->size;
-
-    // send the output data to the next node
-    if (mpi_rank > 0) {
-        ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size);
-    }
-}
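The partitioning logic in the removed `ggml_mpi_graph_compute_pre()` is worth spelling out: rank 0 takes the last slice of layers plus the remainder of the graph, while ranks 1..size-1 take the earlier slices in order. Below is a minimal standalone sketch of just that arithmetic, using hypothetical values `n_layers = 32` and `mpi_size = 4` (chosen for illustration, not taken from the commit); no MPI calls are made.

```c
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

// Reproduces only the layer-range arithmetic from the removed
// ggml_mpi_graph_compute_pre().
int main(void) {
    const int n_layers = 32; // hypothetical model depth
    const int mpi_size = 4;  // hypothetical number of MPI ranks

    const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;

    for (int rank = 0; rank < mpi_size; rank++) {
        // rank 0 maps to the last slice, rank r > 0 maps to slice r - 1
        const int mpi_idx = rank > 0 ? rank - 1 : mpi_size - 1;

        const int il0 =               (mpi_idx + 0) * n_per_node;
        const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);

        printf("rank %d: layers [%d, %d)\n", rank, il0, il1);
    }

    return 0;
}
```

With these values, ranks 1-3 get layers [0, 8), [8, 16) and [16, 24), and rank 0 gets [24, 32) plus the rest of the graph, matching the `node 1 ... node 0` layout described in the removed comments.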
ggml-mpi.h (deleted, 39 lines)
@@ -1,39 +0,0 @@
-#pragma once
-
-struct ggml_context;
-struct ggml_tensor;
-struct ggml_cgraph;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct ggml_mpi_context;
-
-void ggml_mpi_backend_init(void);
-void ggml_mpi_backend_free(void);
-
-struct ggml_mpi_context * ggml_mpi_init(void);
-void ggml_mpi_free(struct ggml_mpi_context * ctx);
-
-int ggml_mpi_rank(struct ggml_mpi_context * ctx);
-
-void ggml_mpi_eval_init(
-        struct ggml_mpi_context * ctx_mpi,
-                            int * n_tokens,
-                            int * n_past,
-                            int * n_threads);
-
-void ggml_mpi_graph_compute_pre(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                              int n_layers);
-
-void ggml_mpi_graph_compute_post(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                              int n_layers);
-
-#ifdef __cplusplus
-}
-#endif
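For context, the lifecycle implied by this header matches the call sites removed from llama.cpp below: backend init at startup, per-context init, a pre/post hook around each graph computation, and backend free at shutdown. The following is only a sketch of that wiring, assuming the removed `ggml-mpi.h` reproduced above is still available; graph building and the actual computation are elided.

```c
#include "ggml-mpi.h" // the header removed by this commit (reproduced above)

// Sketch of how llama.cpp drove the removed API (see the llama.cpp hunks
// below); not a drop-in program.
static void mpi_lifecycle_sketch(void) {
    ggml_mpi_backend_init();                             // was called from llama_backend_init()

    struct ggml_mpi_context * ctx_mpi = ggml_mpi_init(); // was stored in llama_context

    if (ggml_mpi_rank(ctx_mpi) > 0) {
        // worker ranks were meant to sit in a blocking eval loop driven by
        // rank 0; in llama.cpp this path was stubbed out with GGML_ASSERT
    }

    // per evaluation, wrapped around the graph computation:
    //   ggml_mpi_eval_init(ctx_mpi, &n_tokens, &n_past, &n_threads);
    //   ggml_mpi_graph_compute_pre (ctx_mpi, gf, n_layer);
    //   ... ggml_backend_sched_graph_compute_async(...) ...
    //   ggml_mpi_graph_compute_post(ctx_mpi, gf, n_layer);

    ggml_mpi_free(ctx_mpi);
    ggml_mpi_backend_free();                             // was called from llama_backend_free()
}
```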
llama.cpp (46 lines changed)
@@ -26,9 +26,6 @@
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
-#ifdef GGML_USE_MPI
-#  include "ggml-mpi.h"
-#endif
 #ifndef QK_K
 #  ifdef GGML_QKK_64
 #    define QK_K 64

@@ -2270,10 +2267,6 @@ struct llama_context {

     // control vectors
     struct llama_control_vector cvec;
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_context * ctx_mpi = NULL;
-#endif
 };

 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {

@@ -6336,9 +6329,6 @@ static struct ggml_tensor * llm_build_inp_embd(

         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
     } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
         lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
         inpL = lctx.inp_embd;
         ggml_set_input(lctx.inp_embd);

@@ -11351,11 +11341,6 @@ static void llama_graph_compute(
           llama_context & lctx,
             ggml_cgraph * gf,
                     int   n_threads) {
-#ifdef GGML_USE_MPI
-    const int64_t n_layer = lctx.model.hparams.n_layer;
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
-#endif
-
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(lctx.backend_metal)) {
         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);

@@ -11370,10 +11355,6 @@ static void llama_graph_compute(
     ggml_backend_sched_graph_compute_async(lctx.sched, gf);

     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
-#endif
 }

 // decode a batch of tokens by evaluating the transformer

@@ -11411,12 +11392,6 @@ static int llama_decode_internal(
     }
     lctx.n_queued_tokens += n_tokens_all;

-#ifdef GGML_USE_MPI
-    // TODO: needs fix after #3228
-    GGML_ASSERT(false && "not implemented");
-    //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-
     auto & kv_self = lctx.kv_self;

     const int64_t n_embd = hparams.n_embd;

@@ -15546,10 +15521,6 @@ void llama_backend_init(void) {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_init();
-#endif
 }

 void llama_numa_init(enum ggml_numa_strategy numa) {

@@ -15559,9 +15530,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
 }

 void llama_backend_free(void) {
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_free();
-#endif
     ggml_quantize_free();
 }

@@ -15962,20 +15930,6 @@ struct llama_context * llama_new_context_with_model(
         }
     }

-#ifdef GGML_USE_MPI
-    ctx->ctx_mpi = ggml_mpi_init();
-
-    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
-        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-        // TODO: needs fix after #3228
-        GGML_ASSERT(false && "not implemented");
-        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
-        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
-        llama_backend_free();
-        exit(1);
-    }
-#endif
-
     return ctx;
 }

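Note that the public API surface is untouched: `llama_backend_init()` and `llama_backend_free()` keep their signatures and simply no longer call into ggml-mpi behind the scenes. A minimal caller, assuming only the two functions visible in the hunks above, still looks like this.

```c
#include "llama.h"

int main(void) {
    llama_backend_init();  // no longer performs ggml_mpi_backend_init()

    // ... load a model, create a context, decode ...

    llama_backend_free(); // no longer performs ggml_mpi_backend_free()

    return 0;
}
```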
LlamaConfig.cmake.in

@@ -5,7 +5,6 @@ set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
 set(LLAMA_BLAS @LLAMA_BLAS@)
 set(LLAMA_CUDA @LLAMA_CUDA@)
 set(LLAMA_METAL @LLAMA_METAL@)
-set(LLAMA_MPI @LLAMA_MPI@)
 set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
 set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
 set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)

@@ -37,10 +36,6 @@ if (LLAMA_METAL)
     find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
 endif()

-if (LLAMA_MPI)
-    find_package(MPI REQUIRED)
-endif()
-
 if (LLAMA_CLBLAST)
     find_package(CLBlast REQUIRED)
 endif()