diff --git a/Makefile b/Makefile index 937b195b8..b250debd4 100644 --- a/Makefile +++ b/Makefile @@ -148,8 +148,12 @@ ifndef LLAMA_NO_ACCELERATE endif # LLAMA_NO_ACCELERATE ifdef LLAMA_MPI - CFLAGS += -DGGML_USE_MPI -Wno-cast-qual -Wno-int-to-void-pointer-cast -Wno-void-pointer-to-int-cast + CFLAGS += -DGGML_USE_MPI -Wno-cast-qual CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual + OBJS += ggml-mpi.o + +ggml-mpi.o: ggml-mpi.c ggml-mpi.h + $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_MPI ifdef LLAMA_OPENBLAS diff --git a/ggml-mpi.c b/ggml-mpi.c new file mode 100644 index 000000000..bf301d08b --- /dev/null +++ b/ggml-mpi.c @@ -0,0 +1,81 @@ +#include "ggml-mpi.h" + +#include "ggml.h" + +#include +#include +#include +#define UNUSED GGML_UNUSED + +struct ggml_mpi_tensor_info { + int rank; +}; + +// ggml_compute_forward_send + +static void ggml_mpi_compute_forward_send( + struct ggml_tensor * src, + const struct ggml_tensor * orig) { + UNUSED(orig); + GGML_ASSERT(src->type == GGML_TYPE_F32); + + int my_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + + int dst_rank = ((struct ggml_mpi_tensor_info *)src->extra)->rank; + // fprintf(stderr, "(%d) Sending to (%d)\n", my_rank, (int)dst->extra); + int retval = MPI_Send(src->data, ggml_nelements(src), MPI_FLOAT, dst_rank, 0, MPI_COMM_WORLD); + // fprintf(stderr, "(%d) Sent to (%d)\n", my_rank, (int)dst->extra); + GGML_ASSERT(retval == MPI_SUCCESS); +} + +// ggml_compute_forward_recv + +static void ggml_mpi_compute_forward_recv( + struct ggml_tensor * dst, + const struct ggml_tensor * orig, + const struct ggml_tensor * parent) { + UNUSED(parent); + UNUSED(orig); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + MPI_Status status; + + int my_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + + int src_rank = ((struct ggml_mpi_tensor_info *)dst->extra)->rank; + // fprintf(stderr, "(%d) Receiving from (%d)\n", my_rank, src_extra); + int retval = MPI_Recv(dst->data, ggml_nelements(dst), MPI_FLOAT, src_rank, MPI_ANY_TAG, MPI_COMM_WORLD, &status); + // fprintf(stderr, "(%d) Received from (%d)\n", my_rank, src_extra); + GGML_ASSERT(retval == MPI_SUCCESS); +} + +struct ggml_tensor * ggml_mpi_send_tensor( + struct ggml_context * ctx, + struct ggml_tensor *src, + int dst_rank) { + + struct ggml_tensor * result = ggml_map_custom1_inplace_f32(ctx, src, ggml_mpi_compute_forward_send); + + // TODO how/when to free this struct? + struct ggml_mpi_tensor_info *info = calloc(1, sizeof(struct ggml_mpi_tensor_info)); + info->rank = dst_rank; + result->extra = info; + + return result; +} + +struct ggml_tensor * ggml_mpi_recv_tensor( + struct ggml_context * ctx, + struct ggml_tensor *parent, + struct ggml_tensor *dst, + int src_rank) { + struct ggml_tensor * result = ggml_map_custom2_inplace_f32(ctx, dst, parent, ggml_mpi_compute_forward_recv); + + // TODO how/when to free this struct? + struct ggml_mpi_tensor_info *info = calloc(1, sizeof(struct ggml_mpi_tensor_info)); + info->rank = src_rank; + result->extra = info; + + return result; +} diff --git a/ggml-mpi.h b/ggml-mpi.h new file mode 100644 index 000000000..ef5269dc5 --- /dev/null +++ b/ggml-mpi.h @@ -0,0 +1,22 @@ +#pragma once + +struct ggml_context; +struct ggml_tensor; + +#ifdef __cplusplus +extern "C" { +#endif + +struct ggml_tensor * ggml_mpi_send_tensor( + struct ggml_context * ctx, + struct ggml_tensor *src, + int dst_rank); +struct ggml_tensor * ggml_mpi_recv_tensor( + struct ggml_context * ctx, + struct ggml_tensor *parent, + struct ggml_tensor *dst, + int src_rank); + +#ifdef __cplusplus +} +#endif diff --git a/ggml.c b/ggml.c index 99b7b75a8..d257c3d65 100644 --- a/ggml.c +++ b/ggml.c @@ -26,10 +26,6 @@ #include #include -#ifdef GGML_USE_MPI -#include -#endif - #ifdef GGML_USE_METAL #include #endif @@ -4688,36 +4684,6 @@ struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggm return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL); } -struct ggml_tensor * ggml_send_tensor( - struct ggml_context * ctx, - struct ggml_tensor *src, - int dst_rank) { - - struct ggml_tensor * result = ggml_new_i32(ctx, 0); - - result->op = GGML_OP_SEND; - result->src0 = src; - result->extra = (void *)dst_rank; - - return result; -} - -struct ggml_tensor * ggml_recv_tensor( - struct ggml_context * ctx, - struct ggml_tensor *parent, - struct ggml_tensor *dst, - int src_rank) { - UNUSED(ctx); - - struct ggml_tensor * result = dst; - - result->op = GGML_OP_RECV; - result->src0 = parent; // just used for graph computation - result->extra = (void *)src_rank; - - return result; -} - struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { memset(tensor->data, 0, ggml_nbytes(tensor)); return tensor; @@ -8323,52 +8289,6 @@ static void ggml_compute_forward_dup( } } -// ggml_compute_forward_recv - -static void ggml_compute_forward_recv( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#ifdef GGML_USE_MPI - MPI_Status status; - int my_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - // fprintf(stderr, "(%d) Receiving from (%d)\n", my_rank, (int)dst->extra); - int retval = MPI_Recv(dst->data, dst->ne[0] * dst->ne[1], MPI_FLOAT, (int)dst->extra, MPI_ANY_TAG, MPI_COMM_WORLD, &status); - // fprintf(stderr, "(%d) Received from (%d)\n", my_rank, (int)dst->extra); - GGML_ASSERT(retval == MPI_SUCCESS); -#else - GGML_ASSERT(false); -#endif -} - -// ggml_compute_forward_send - -static void ggml_compute_forward_send( - const struct ggml_compute_params * params, - struct ggml_tensor * src, - struct ggml_tensor * dst) { - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - GGML_ASSERT(src->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_I32); -#ifdef GGML_USE_MPI - int my_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - // fprintf(stderr, "(%d) Sending to (%d)\n", my_rank, (int)dst->extra); - int retval = MPI_Send(src->data, src->ne[0] * src->ne[1], MPI_FLOAT, (int)dst->extra, 0, MPI_COMM_WORLD); - // fprintf(stderr, "(%d) Sent to (%d)\n", my_rank, (int)dst->extra); - ggml_set_i32(dst, retval); - GGML_ASSERT(retval == MPI_SUCCESS); -#else - GGML_ASSERT(false); -#endif -} - // ggml_compute_forward_add static void ggml_compute_forward_add_f32( @@ -14655,14 +14575,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_dup(params, tensor->src0, tensor); } break; - case GGML_OP_SEND: - { - ggml_compute_forward_send(params, tensor->src0, tensor); - } break; - case GGML_OP_RECV: - { - ggml_compute_forward_recv(params, tensor); - } break; case GGML_OP_ADD: { ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor); @@ -14961,14 +14873,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); } } break; - case GGML_OP_SEND: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_RECV: - { - GGML_ASSERT(false); // TODO: not implemented - } break; case GGML_OP_ADD: { if (src0->grad) { @@ -16307,8 +16211,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { node->n_tasks = 1; } break; - case GGML_OP_SEND: - case GGML_OP_RECV: case GGML_OP_SET: case GGML_OP_CONT: case GGML_OP_RESHAPE: diff --git a/ggml.h b/ggml.h index f204d1342..d0710c555 100644 --- a/ggml.h +++ b/ggml.h @@ -381,9 +381,6 @@ extern "C" { GGML_OP_CROSS_ENTROPY_LOSS_BACK, GGML_OP_COUNT, - - GGML_OP_SEND, - GGML_OP_RECV, }; @@ -587,16 +584,6 @@ extern "C" { GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); - GGML_API struct ggml_tensor * ggml_send_tensor( - struct ggml_context * ctx, - struct ggml_tensor *src, - int dst_rank); - GGML_API struct ggml_tensor * ggml_recv_tensor( - struct ggml_context * ctx, - struct ggml_tensor *parent, - struct ggml_tensor *dst, - int src_rank); - GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); diff --git a/llama.cpp b/llama.cpp index 99abde348..42b2f6155 100644 --- a/llama.cpp +++ b/llama.cpp @@ -19,6 +19,9 @@ #ifdef GGML_USE_METAL #include "ggml-metal.h" #endif +#ifdef GGML_USE_MPI +#include "ggml-mpi.h" +#endif #ifdef GGML_USE_K_QUANTS #ifndef QK_K #ifdef GGML_QKK_64 @@ -1332,10 +1335,10 @@ static bool llama_eval_internal( if (lctx.mpi_rank > 0) { #ifdef GGML_USE_MPI - inpL = ggml_recv_tensor(ctx0, NULL, + inpL = ggml_mpi_recv_tensor(ctx0, NULL, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N), lctx.mpi_rank-1); - ggml_set_name(inpL, "recv"); + ggml_set_name(inpL, "mpi_recv"); #else GGML_ASSERT(false); #endif @@ -1591,15 +1594,23 @@ static bool llama_eval_internal( struct ggml_tensor * embeddings = NULL; if (lctx.mpi_size > 1) { - cur = ggml_send_tensor(ctx0, cur, (lctx.mpi_rank+1)%lctx.mpi_size); - ggml_set_name(cur, "send"); +#ifdef GGML_USE_MPI + cur = ggml_mpi_send_tensor(ctx0, cur, (lctx.mpi_rank+1)%lctx.mpi_size); + ggml_set_name(cur, "mpi_send"); +#else + GGML_ASSERT(false); +#endif } if (lctx.mpi_rank == 0) { if (lctx.mpi_size > 1) { - cur = ggml_recv_tensor(ctx0, cur, +#ifdef GGML_USE_MPI + cur = ggml_mpi_recv_tensor(ctx0, cur, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N), lctx.mpi_size-1); - ggml_set_name(cur, "recv"); + ggml_set_name(cur, "mpi_recv"); +#else + GGML_ASSERT(false); +#endif } // norm { @@ -3504,14 +3515,6 @@ int llama_n_embd(const struct llama_context * ctx) { return ctx->model.hparams.n_embd; } -int llama_mpi_rank(const struct llama_context * ctx) { - return ctx->mpi_rank; -} - -int llama_mpi_size(const struct llama_context * ctx) { - return ctx->mpi_size; -} - int llama_get_vocab( const struct llama_context * ctx, const char * * strings, diff --git a/llama.h b/llama.h index 14bc432c7..b90c52355 100644 --- a/llama.h +++ b/llama.h @@ -273,8 +273,6 @@ extern "C" { LLAMA_API int llama_n_vocab(const struct llama_context * ctx); LLAMA_API int llama_n_ctx (const struct llama_context * ctx); LLAMA_API int llama_n_embd (const struct llama_context * ctx); - LLAMA_API int llama_mpi_rank (const struct llama_context * ctx); - LLAMA_API int llama_mpi_size (const struct llama_context * ctx); // Get the vocabulary as output parameters. // Returns number of results.