Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-10-29 22:20:15 +01:00)

llama : remove MPI backend (#7395)

commit d359f30921 (parent 1ea2a0036e)
.devops/nix/package.nix

@@ -214,7 +214,6 @@ effectiveStdenv.mkDerivation (
         (cmakeBool "LLAMA_CUDA" useCuda)
         (cmakeBool "LLAMA_HIPBLAS" useRocm)
         (cmakeBool "LLAMA_METAL" useMetalKit)
-        (cmakeBool "LLAMA_MPI" useMpi)
         (cmakeBool "LLAMA_VULKAN" useVulkan)
         (cmakeBool "LLAMA_STATIC" enableStatic)
       ]
.github/workflows/build.yml (34 lines changed)

@@ -306,40 +306,6 @@ jobs:
           cd build
           ctest -L main --verbose --timeout 900
 
-  ubuntu-latest-cmake-mpi:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    strategy:
-      matrix:
-        mpi_library: [mpich, libopenmpi-dev]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ${{ matrix.mpi_library }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake -DLLAMA_MPI=ON ..
-          cmake --build . --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose
-
   ubuntu-latest-cmake-rpc:
     runs-on: ubuntu-latest
 
CMakeLists.txt

@@ -122,7 +122,6 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
     "llama: metal minimum macOS version")
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
-option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_RPC "llama: use RPC" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)

@@ -466,35 +465,6 @@ if (LLAMA_CUDA)
     endif()
 endif()
 
-if (LLAMA_MPI)
-    cmake_minimum_required(VERSION 3.10)
-    find_package(MPI)
-    if (MPI_C_FOUND)
-        message(STATUS "MPI found")
-
-        set(GGML_HEADERS_MPI ggml-mpi.h)
-        set(GGML_SOURCES_MPI ggml-mpi.c)
-
-        add_compile_definitions(GGML_USE_MPI)
-        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
-
-        if (NOT MSVC)
-            add_compile_options(-Wno-cast-qual)
-        endif()
-
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
-        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
-
-        # Even if you're only using the C header, C++ programs may bring in MPI
-        # C++ functions, so more linkage is needed
-        if (MPI_CXX_FOUND)
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
-        endif()
-    else()
-        message(WARNING "MPI not found")
-    endif()
-endif()
-
 if (LLAMA_RPC)
     add_compile_definitions(GGML_USE_RPC)
 

@@ -1218,7 +1188,6 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
             ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
             ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
             ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}

@@ -1306,7 +1275,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
 
 set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
     "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
-    "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
+    "${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 install(TARGETS ggml PUBLIC_HEADER)
Makefile (12 lines changed)

@@ -399,13 +399,6 @@ ifndef LLAMA_NO_ACCELERATE
     endif
 endif # LLAMA_NO_ACCELERATE
 
-ifdef LLAMA_MPI
-    MK_CPPFLAGS += -DGGML_USE_MPI
-    MK_CFLAGS   += -Wno-cast-qual
-    MK_CXXFLAGS += -Wno-cast-qual
-    OBJS        += ggml-mpi.o
-endif # LLAMA_MPI
-
 ifdef LLAMA_OPENBLAS
     MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
     MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)

@@ -629,11 +622,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 endif
 endif # LLAMA_METAL
 
-ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
-endif # LLAMA_MPI
-
 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
README.md (39 lines changed)

@@ -382,45 +382,6 @@ To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or th
 When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
 argument.
 
-### MPI Build
-
-MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
-
-First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
-
-Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
-
-- Using `make`:
-
-  ```bash
-  make CC=mpicc CXX=mpicxx LLAMA_MPI=1
-  ```
-
-- Using `CMake`:
-
-  ```bash
-  cmake -S . -B build -DLLAMA_MPI=ON
-  ```
-
-Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
-
-Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
-
-Here is an example hostfile:
-
-```
-192.168.0.1:2
-malvolio.local:1
-```
-
-The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
-
-Finally, you're ready to run a computation using `mpirun`:
-
-```bash
-mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
-```
-
 ### BLAS Build
 
 Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
ggml-mpi.c (216 lines deleted)

@@ -1,216 +0,0 @@
-#include "ggml-mpi.h"
-
-#include "ggml.h"
-
-#include <mpi.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
-#define UNUSED GGML_UNUSED
-
-struct ggml_mpi_context {
-    int rank;
-    int size;
-};
-
-void ggml_mpi_backend_init(void) {
-    MPI_Init(NULL, NULL);
-}
-
-void ggml_mpi_backend_free(void) {
-    MPI_Finalize();
-}
-
-struct ggml_mpi_context * ggml_mpi_init(void) {
-    struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context));
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
-
-    return ctx;
-}
-
-void ggml_mpi_free(struct ggml_mpi_context * ctx) {
-    free(ctx);
-}
-
-int ggml_mpi_rank(struct ggml_mpi_context * ctx) {
-    return ctx->rank;
-}
-
-void ggml_mpi_eval_init(
-        struct ggml_mpi_context * ctx_mpi,
-                            int * n_tokens,
-                            int * n_past,
-                            int * n_threads) {
-    UNUSED(ctx_mpi);
-
-    // synchronize the worker node parameters with the root node
-    MPI_Barrier(MPI_COMM_WORLD);
-
-    MPI_Bcast(n_tokens,  1, MPI_INT, 0, MPI_COMM_WORLD);
-    MPI_Bcast(n_past,    1, MPI_INT, 0, MPI_COMM_WORLD);
-    MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);
-}
-
-static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
-    struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
-    if (t == NULL) {
-        fprintf(stderr, "%s: tensor %s not found\n", __func__, name);
-        return -1;
-    }
-
-    for (int i = 0; i < gf->n_nodes; i++) {
-        if (gf->nodes[i] == t) {
-            return i;
-        }
-    }
-
-    fprintf(stderr, "%s: tensor %s not found in graph (should not happen)\n", __func__, name);
-    return -1;
-}
-
-static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
-    MPI_Datatype mpi_type;
-
-    switch (t->type) {
-        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
-        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
-        default: GGML_ASSERT(false && "not implemented");
-    }
-
-    const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD);
-    GGML_ASSERT(retval == MPI_SUCCESS);
-}
-
-static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {
-    MPI_Datatype mpi_type;
-
-    switch (t->type) {
-        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
-        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
-        default: GGML_ASSERT(false && "not implemented");
-    }
-
-    MPI_Status status; UNUSED(status);
-
-    const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
-    GGML_ASSERT(retval == MPI_SUCCESS);
-}
-
-// TODO: there are many improvements that can be done to this implementation
-void ggml_mpi_graph_compute_pre(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                              int n_layers) {
-    const int mpi_rank = ctx_mpi->rank;
-    const int mpi_size = ctx_mpi->size;
-
-    struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
-    if (inp_tokens == NULL) {
-        fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
-        return;
-    }
-
-    struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0");
-    if (inp0 == NULL) {
-        fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
-        return;
-    }
-
-    GGML_ASSERT(inp0 == gf->nodes[0]);
-
-    // distribute the compute graph into slices across the MPI nodes
-    //
-    // the main node (0) processes the last layers + the remainder of the compute graph
-    // and is responsible to pass the input tokens to the first node (1)
-    //
-    // node 1:   [(  0) * n_per_node, (  1) * n_per_node)
-    // node 2:   [(  1) * n_per_node, (  2) * n_per_node)
-    // ...
-    // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node)
-    // node 0:   [(n-1) * n_per_node,            n_nodes)
-    //
-    if (mpi_rank > 0) {
-        if (mpi_rank == 1) {
-            // the first node (1) receives the input tokens from the main node (0)
-            ggml_mpi_tensor_recv(inp_tokens, 0);
-        } else {
-            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
-            ggml_mpi_tensor_recv(inp0, mpi_rank - 1);
-        }
-    } else if (mpi_size > 1) {
-        // node 0 sends the input tokens to node 1
-        ggml_mpi_tensor_send(inp_tokens, 1);
-
-        // recv the output data from the last node
-        ggml_mpi_tensor_recv(inp0, mpi_size - 1);
-    }
-
-    {
-        const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
-
-        const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
-
-        const int il0 =               (mpi_idx + 0) * n_per_node;
-        const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
-
-        char name_l0[GGML_MAX_NAME];
-        char name_l1[GGML_MAX_NAME];
-
-        snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0);
-        snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
-
-        const int idx_l0 =                ggml_graph_get_node_idx(gf, name_l0);
-        const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
-
-        if (idx_l0 < 0 || idx_l1 < 0) {
-            fprintf(stderr, "%s: layer input nodes not found\n", __func__);
-            return;
-        }
-
-        // attach the input data to all nodes that need it
-        // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
-        for (int i = idx_l0; i < idx_l1; i++) {
-            if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
-                gf->nodes[i]->src[0] = inp0;
-            }
-            if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
-                gf->nodes[i]->src[1] = inp0;
-            }
-        }
-
-        // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph
-        for (int i = 1; i < idx_l1 - idx_l0; i++) {
-            gf->nodes[i] = gf->nodes[idx_l0 + i];
-            gf->grads[i] = gf->grads[idx_l0 + i];
-        }
-
-        // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node
-        if (mpi_idx != 0) {
-            gf->nodes[0]->op = GGML_OP_NONE;
-        }
-
-        gf->n_nodes = idx_l1 - idx_l0;
-
-        //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1);
-    }
-}
-
-void ggml_mpi_graph_compute_post(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                              int n_layers) {
-    UNUSED(n_layers);
-
-    const int mpi_rank = ctx_mpi->rank;
-    const int mpi_size = ctx_mpi->size;
-
-    // send the output data to the next node
-    if (mpi_rank > 0) {
-        ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size);
-    }
-}
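For reference, the slicing scheme that the removed ggml_mpi_graph_compute_pre() implemented can be reproduced with the arithmetic below. This is a minimal standalone sketch, not part of the diff; the example values n_layers = 32 and mpi_size = 3 are assumptions chosen to match the 3-process mpirun example in the removed README section.

```c
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    const int n_layers = 32; // assumed model depth for illustration
    const int mpi_size = 3;  // assumed number of MPI processes

    // same rounding-up division the removed code used
    const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;

    for (int mpi_rank = 0; mpi_rank < mpi_size; mpi_rank++) {
        // rank 0 handles the last slice (plus the rest of the graph);
        // ranks 1..n-1 handle the earlier slices in order
        const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;

        const int il0 = (mpi_idx + 0) * n_per_node;
        const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);

        printf("rank %d computes layers [%d, %d)\n", mpi_rank, il0, il1);
    }

    return 0;
}
```

With these assumed values, n_per_node is 11, so rank 1 gets layers [0, 11), rank 2 gets [11, 22), and rank 0 gets [22, 32) plus the remainder of the compute graph.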
ggml-mpi.h (39 lines deleted)

@@ -1,39 +0,0 @@
-#pragma once
-
-struct ggml_context;
-struct ggml_tensor;
-struct ggml_cgraph;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct ggml_mpi_context;
-
-void ggml_mpi_backend_init(void);
-void ggml_mpi_backend_free(void);
-
-struct ggml_mpi_context * ggml_mpi_init(void);
-void ggml_mpi_free(struct ggml_mpi_context * ctx);
-
-int ggml_mpi_rank(struct ggml_mpi_context * ctx);
-
-void ggml_mpi_eval_init(
-        struct ggml_mpi_context * ctx_mpi,
-                            int * n_tokens,
-                            int * n_past,
-                            int * n_threads);
-
-void ggml_mpi_graph_compute_pre(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                              int n_layers);
-
-void ggml_mpi_graph_compute_post(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                              int n_layers);
-
-#ifdef __cplusplus
-}
-#endif
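The header above boiled down to a small synchronization protocol: rank 0 drives evaluation and the worker ranks receive the evaluation parameters via broadcast before computing their slice of the graph. Below is a minimal sketch of that pattern, independent of ggml; the parameter values and the mpicc/mpirun invocation are assumptions for illustration, not taken from this commit.

```c
#include <mpi.h>
#include <stdio.h>

// Build: mpicc eval_init_sketch.c -o eval_init_sketch   (assumed filename)
// Run:   mpirun -n 3 ./eval_init_sketch
int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv); // the removed ggml_mpi_backend_init() called MPI_Init(NULL, NULL)

    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // only the root rank knows the real values; workers start with placeholders
    int n_tokens  = (rank == 0) ? 8 : -1;
    int n_past    = (rank == 0) ? 0 : -1;
    int n_threads = (rank == 0) ? 4 : -1;

    // same sequence as the removed ggml_mpi_eval_init(): barrier, then broadcasts from rank 0
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Bcast(&n_tokens,  1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&n_past,    1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);

    printf("rank %d: n_tokens=%d n_past=%d n_threads=%d\n", rank, n_tokens, n_past, n_threads);

    MPI_Finalize(); // the removed ggml_mpi_backend_free() called MPI_Finalize()
    return 0;
}
```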
llama.cpp (48 lines changed)

@@ -26,9 +26,6 @@
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
-#ifdef GGML_USE_MPI
-#  include "ggml-mpi.h"
-#endif
 #ifndef QK_K
 #  ifdef GGML_QKK_64
 #    define QK_K 64

@@ -2270,10 +2267,6 @@ struct llama_context {
 
     // control vectors
     struct llama_control_vector cvec;
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_context * ctx_mpi = NULL;
-#endif
 };
 
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {

@@ -6336,10 +6329,7 @@ static struct ggml_tensor * llm_build_inp_embd(
 
         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
     } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
         lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
         inpL = lctx.inp_embd;
         ggml_set_input(lctx.inp_embd);
     }

@@ -11351,11 +11341,6 @@ static void llama_graph_compute(
           llama_context & lctx,
             ggml_cgraph * gf,
                     int   n_threads) {
-#ifdef GGML_USE_MPI
-    const int64_t n_layer = lctx.model.hparams.n_layer;
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
-#endif
-
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(lctx.backend_metal)) {
         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);

@@ -11370,10 +11355,6 @@ static void llama_graph_compute(
     ggml_backend_sched_graph_compute_async(lctx.sched, gf);
 
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
-#endif
 }
 
 // decode a batch of tokens by evaluating the transformer

@@ -11411,12 +11392,6 @@ static int llama_decode_internal(
     }
     lctx.n_queued_tokens += n_tokens_all;
 
-#ifdef GGML_USE_MPI
-    // TODO: needs fix after #3228
-    GGML_ASSERT(false && "not implemented");
-    //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-
     auto & kv_self = lctx.kv_self;
 
     const int64_t n_embd = hparams.n_embd;

@@ -15546,10 +15521,6 @@ void llama_backend_init(void) {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_init();
-#endif
 }
 
 void llama_numa_init(enum ggml_numa_strategy numa) {

@@ -15559,9 +15530,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
 }
 
 void llama_backend_free(void) {
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_free();
-#endif
     ggml_quantize_free();
 }
 

@@ -15962,20 +15930,6 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
-#ifdef GGML_USE_MPI
-    ctx->ctx_mpi = ggml_mpi_init();
-
-    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
-        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-        // TODO: needs fix after #3228
-        GGML_ASSERT(false && "not implemented");
-        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
-        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
-        llama_backend_free();
-        exit(1);
-    }
-#endif
-
     return ctx;
 }
 
LlamaConfig.cmake.in

@@ -5,7 +5,6 @@ set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
 set(LLAMA_BLAS @LLAMA_BLAS@)
 set(LLAMA_CUDA @LLAMA_CUDA@)
 set(LLAMA_METAL @LLAMA_METAL@)
-set(LLAMA_MPI @LLAMA_MPI@)
 set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
 set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
 set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)

@@ -37,10 +36,6 @@ if (LLAMA_METAL)
     find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
 endif()
 
-if (LLAMA_MPI)
-    find_package(MPI REQUIRED)
-endif()
-
 if (LLAMA_CLBLAST)
     find_package(CLBlast REQUIRED)
 endif()