From d05ca74dd81018dff00afbdd226c40f38c9ed2f5 Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Mon, 3 Jul 2023 23:53:43 -0400 Subject: [PATCH] fix warnings, update README --- Makefile | 5 +++++ README.md | 29 +++++++++++++++++++++++++++++ ggml.c | 5 +++-- ggml.h | 4 ++-- 4 files changed, 39 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 03f38bdba..f3bda7b9f 100644 --- a/Makefile +++ b/Makefile @@ -149,6 +149,11 @@ ifndef LLAMA_NO_ACCELERATE endif endif # LLAMA_NO_ACCELERATE +ifdef LLAMA_MPI + CFLAGS += -DGGML_USE_MPI + CXXFLAGS += -DGGML_USE_MPI +endif # LLAMA_MPI + ifdef LLAMA_OPENBLAS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas LDFLAGS += -lopenblas diff --git a/README.md b/README.md index e890dc9c2..5f5dee6bf 100644 --- a/README.md +++ b/README.md @@ -267,6 +267,35 @@ Any value larger than 0 will offload the computation to the GPU. For example: ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1 ``` +### MPI Build + +MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine. + +First, build llama.cpp and download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines. You will need to build llama.cpp with an MPI-capable compiler, for example, + +```bash +make CC=mpicc CXX=mpicxx LLAMA_MPI=1 +``` + +Once the programs are built and the weights are downloaded on all machines, ensure password-less SSH access to each machine from the primary host. + +Next, create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost". + +Here is an example hostfile: + +``` +192.168.0.1:2 +malvolio.local:1 +``` + +The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive. + +Finally, you're ready to run a computation using `mpirun`: + +```bash +mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 +``` + ### BLAS Build Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it: diff --git a/ggml.c b/ggml.c index 2f00428d3..074e63cc7 100644 --- a/ggml.c +++ b/ggml.c @@ -4652,7 +4652,7 @@ struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggm struct ggml_tensor * ggml_send_tensor( struct ggml_context * ctx, - const struct ggml_tensor *src, + struct ggml_tensor *src, int dst_rank) { struct ggml_tensor * result = ggml_new_i32(ctx, 0); @@ -4666,9 +4666,10 @@ struct ggml_tensor * ggml_send_tensor( struct ggml_tensor * ggml_recv_tensor( struct ggml_context * ctx, - const struct ggml_tensor *parent, + struct ggml_tensor *parent, struct ggml_tensor *dst, int src_rank) { + UNUSED(ctx); struct ggml_tensor * result = dst; diff --git a/ggml.h b/ggml.h index aa78f17dd..de7bd2640 100644 --- a/ggml.h +++ b/ggml.h @@ -561,11 +561,11 @@ extern "C" { GGML_API struct ggml_tensor * ggml_send_tensor( struct ggml_context * ctx, - const struct ggml_tensor *src, + struct ggml_tensor *src, int dst_rank); GGML_API struct ggml_tensor * ggml_recv_tensor( struct ggml_context * ctx, - const struct ggml_tensor *parent, + struct ggml_tensor *parent, struct ggml_tensor *dst, int src_rank);