mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-23 18:09:18 +01:00
llama : offload to RPC in addition to other backends (#7640)
* llama : offload to RPC in addition to other backends * - fix copy_tensor being called on the src buffer instead of the dst buffer - always initialize views in the view_src buffer - add RPC backend to Makefile build - add endpoint to all RPC object names * add rpc-server to Makefile * Update llama.cpp Co-authored-by: slaren <slarengh@gmail.com> --------- Co-authored-by: slaren <slarengh@gmail.com>
This commit is contained in:
parent
a5735e4426
commit
bde7cd3cd9
29
Makefile
29
Makefile
@ -69,6 +69,10 @@ ifeq ($(UNAME_S),Darwin)
|
|||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifdef LLAMA_RPC
|
||||||
|
BUILD_TARGETS += rpc-server
|
||||||
|
endif
|
||||||
|
|
||||||
default: $(BUILD_TARGETS)
|
default: $(BUILD_TARGETS)
|
||||||
|
|
||||||
test: $(TEST_TARGETS)
|
test: $(TEST_TARGETS)
|
||||||
@ -429,6 +433,11 @@ ifdef LLAMA_BLIS
|
|||||||
MK_LDFLAGS += -lblis -L/usr/local/lib
|
MK_LDFLAGS += -lblis -L/usr/local/lib
|
||||||
endif # LLAMA_BLIS
|
endif # LLAMA_BLIS
|
||||||
|
|
||||||
|
ifdef LLAMA_RPC
|
||||||
|
MK_CPPFLAGS += -DGGML_USE_RPC
|
||||||
|
OBJS += ggml-rpc.o
|
||||||
|
endif # LLAMA_RPC
|
||||||
|
|
||||||
ifdef LLAMA_CUBLAS
|
ifdef LLAMA_CUBLAS
|
||||||
# LLAMA_CUBLAS is deprecated and will be removed in the future
|
# LLAMA_CUBLAS is deprecated and will be removed in the future
|
||||||
LLAMA_CUDA := 1
|
LLAMA_CUDA := 1
|
||||||
@ -654,11 +663,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
|
|||||||
endif
|
endif
|
||||||
endif # LLAMA_METAL
|
endif # LLAMA_METAL
|
||||||
|
|
||||||
|
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
|
||||||
|
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
|
||||||
|
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
|
||||||
|
|
||||||
ifndef LLAMA_NO_LLAMAFILE
|
ifndef LLAMA_NO_LLAMAFILE
|
||||||
sgemm.o: sgemm.cpp sgemm.h ggml.h
|
sgemm.o: sgemm.cpp sgemm.h ggml.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifdef LLAMA_RPC
|
||||||
|
ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
endif # LLAMA_RPC
|
||||||
|
|
||||||
GF_CC := $(CC)
|
GF_CC := $(CC)
|
||||||
include scripts/get-flags.mk
|
include scripts/get-flags.mk
|
||||||
|
|
||||||
@ -738,14 +762,9 @@ unicode.o: unicode.cpp unicode.h
|
|||||||
unicode-data.o: unicode-data.cpp unicode-data.h
|
unicode-data.o: unicode-data.cpp unicode-data.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
|
|
||||||
|
|
||||||
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
|
|
||||||
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
|
|
||||||
|
|
||||||
common.o: common/common.cpp $(COMMON_H_DEPS)
|
common.o: common/common.cpp $(COMMON_H_DEPS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
|||||||
// this tensor was allocated without ggml-backend
|
// this tensor was allocated without ggml-backend
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
|
ggml_backend_view_init(tensor);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (tensor->data == NULL) {
|
if (tensor->data == NULL) {
|
||||||
@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
|||||||
if (t->view_src == NULL) {
|
if (t->view_src == NULL) {
|
||||||
ggml_tallocr_alloc(&tallocr, t);
|
ggml_tallocr_alloc(&tallocr, t);
|
||||||
} else if (t->buffer == NULL) {
|
} else if (t->buffer == NULL) {
|
||||||
ggml_backend_view_init(buffer, t);
|
ggml_backend_view_init(t);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (t->view_src != NULL && t->buffer == NULL) {
|
if (t->view_src != NULL && t->buffer == NULL) {
|
||||||
// view of a pre-allocated tensor
|
// view of a pre-allocated tensor
|
||||||
ggml_backend_view_init(buffer, t);
|
ggml_backend_view_init(t);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
|||||||
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||||
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
||||||
if (dst_buf->iface.cpy_tensor) {
|
if (dst_buf->iface.cpy_tensor) {
|
||||||
return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
|
return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
|
|||||||
|
|
||||||
// utils
|
// utils
|
||||||
|
|
||||||
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
void ggml_backend_view_init(struct ggml_tensor * tensor) {
|
||||||
GGML_ASSERT(tensor->buffer == NULL);
|
GGML_ASSERT(tensor->buffer == NULL);
|
||||||
GGML_ASSERT(tensor->view_src != NULL);
|
GGML_ASSERT(tensor->view_src != NULL);
|
||||||
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
||||||
GGML_ASSERT(tensor->view_src->data != NULL);
|
GGML_ASSERT(tensor->view_src->data != NULL);
|
||||||
|
|
||||||
tensor->buffer = buffer;
|
tensor->buffer = tensor->view_src->buffer;
|
||||||
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
||||||
ggml_backend_buffer_init_tensor(buffer, tensor);
|
ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
|
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
|
||||||
@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
|
|||||||
struct ggml_tensor * dst = node_copies[id];
|
struct ggml_tensor * dst = node_copies[id];
|
||||||
if (dst->view_src != NULL) {
|
if (dst->view_src != NULL) {
|
||||||
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
||||||
ggml_backend_view_init(dst->view_src->buffer, dst);
|
ggml_backend_view_init(dst);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
ggml_backend_tensor_copy(src, dst);
|
ggml_backend_tensor_copy(src, dst);
|
||||||
|
@ -225,7 +225,7 @@ extern "C" {
|
|||||||
|
|
||||||
// Tensor initialization
|
// Tensor initialization
|
||||||
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
||||||
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
@ -491,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
|
|||||||
if (remote_ptr != 0) {
|
if (remote_ptr != 0) {
|
||||||
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
||||||
ggml_backend_rpc_buffer_interface,
|
ggml_backend_rpc_buffer_interface,
|
||||||
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"},
|
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
|
||||||
remote_size);
|
remote_size);
|
||||||
return buffer;
|
return buffer;
|
||||||
} else {
|
} else {
|
||||||
@ -692,7 +692,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const
|
|||||||
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
||||||
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
||||||
/* .endpoint = */ endpoint,
|
/* .endpoint = */ endpoint,
|
||||||
/* .name = */ "RPC",
|
/* .name = */ "RPC[" + std::string(endpoint) + "]",
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_backend_t backend = new ggml_backend {
|
ggml_backend_t backend = new ggml_backend {
|
||||||
|
88
llama.cpp
88
llama.cpp
@ -2371,13 +2371,34 @@ struct llama_context {
|
|||||||
struct llama_control_vector cvec;
|
struct llama_control_vector cvec;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static size_t llama_get_device_count(const llama_model & model) {
|
||||||
|
size_t count = 1;
|
||||||
|
#if defined(GGML_USE_CUDA)
|
||||||
|
count = ggml_backend_cuda_get_device_count();
|
||||||
|
#elif defined(GGML_USE_SYCL)
|
||||||
|
count = ggml_backend_sycl_get_device_count();
|
||||||
|
#elif defined(GGML_USE_VULKAN)
|
||||||
|
count = ggml_backend_vk_get_device_count();
|
||||||
|
#endif
|
||||||
|
#if defined(GGML_USE_RPC)
|
||||||
|
count += model.rpc_servers.size();
|
||||||
|
#endif
|
||||||
|
return count;
|
||||||
|
GGML_UNUSED(model);
|
||||||
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
||||||
ggml_backend_buffer_type_t buft = nullptr;
|
ggml_backend_buffer_type_t buft = nullptr;
|
||||||
|
|
||||||
#ifdef GGML_USE_RPC
|
#if defined(GGML_USE_RPC)
|
||||||
std::string endpoint = model.rpc_servers[gpu];
|
int dev_count = (int)llama_get_device_count(model);
|
||||||
buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
|
int rpc_count = (int)model.rpc_servers.size();
|
||||||
#elif defined(GGML_USE_METAL)
|
if (gpu >= dev_count - rpc_count) {
|
||||||
|
const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
|
||||||
|
return ggml_backend_rpc_buffer_type(endpoint);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(GGML_USE_METAL)
|
||||||
buft = ggml_backend_metal_buffer_type();
|
buft = ggml_backend_metal_buffer_type();
|
||||||
#elif defined(GGML_USE_CUDA)
|
#elif defined(GGML_USE_CUDA)
|
||||||
buft = ggml_backend_cuda_buffer_type(gpu);
|
buft = ggml_backend_cuda_buffer_type(gpu);
|
||||||
@ -2425,29 +2446,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
|
|||||||
GGML_UNUSED(tensor_split);
|
GGML_UNUSED(tensor_split);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t llama_get_device_count(const llama_model & model) {
|
|
||||||
#if defined(GGML_USE_RPC)
|
|
||||||
return model.rpc_servers.size();
|
|
||||||
#elif defined(GGML_USE_CUDA)
|
|
||||||
return ggml_backend_cuda_get_device_count();
|
|
||||||
#elif defined(GGML_USE_SYCL)
|
|
||||||
return ggml_backend_sycl_get_device_count();
|
|
||||||
#elif defined(GGML_USE_VULKAN)
|
|
||||||
return ggml_backend_vk_get_device_count();
|
|
||||||
#else
|
|
||||||
return 1;
|
|
||||||
#endif
|
|
||||||
GGML_UNUSED(model);
|
|
||||||
}
|
|
||||||
|
|
||||||
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
||||||
#if defined(GGML_USE_RPC)
|
#if defined(GGML_USE_RPC)
|
||||||
size_t total;
|
int dev_count = (int)llama_get_device_count(model);
|
||||||
size_t free;
|
int rpc_count = (int)model.rpc_servers.size();
|
||||||
std::string endpoint = model.rpc_servers[device];
|
if (device >= dev_count - rpc_count) {
|
||||||
ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
|
size_t total;
|
||||||
return free;
|
size_t free;
|
||||||
#elif defined(GGML_USE_CUDA)
|
const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
|
||||||
|
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
|
||||||
|
return free;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(GGML_USE_CUDA)
|
||||||
size_t total;
|
size_t total;
|
||||||
size_t free;
|
size_t free;
|
||||||
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
||||||
@ -16160,7 +16171,7 @@ struct llama_model * llama_load_model_from_file(
|
|||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
if (params.rpc_servers != nullptr) {
|
if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
|
||||||
// split the servers set them into model->rpc_servers
|
// split the servers set them into model->rpc_servers
|
||||||
std::string servers(params.rpc_servers);
|
std::string servers(params.rpc_servers);
|
||||||
size_t pos = 0;
|
size_t pos = 0;
|
||||||
@ -16323,17 +16334,7 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
|
|
||||||
if (!hparams.vocab_only) {
|
if (!hparams.vocab_only) {
|
||||||
// initialize backends
|
// initialize backends
|
||||||
#if defined(GGML_USE_RPC)
|
#if defined(GGML_USE_METAL)
|
||||||
for (auto & server : model->rpc_servers) {
|
|
||||||
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
|
|
||||||
if (backend == nullptr) {
|
|
||||||
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
|
|
||||||
llama_free(ctx);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
ctx->backends.push_back(backend);
|
|
||||||
}
|
|
||||||
#elif defined(GGML_USE_METAL)
|
|
||||||
if (model->n_gpu_layers > 0) {
|
if (model->n_gpu_layers > 0) {
|
||||||
ctx->backend_metal = ggml_backend_metal_init();
|
ctx->backend_metal = ggml_backend_metal_init();
|
||||||
if (ctx->backend_metal == nullptr) {
|
if (ctx->backend_metal == nullptr) {
|
||||||
@ -16425,6 +16426,19 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
}
|
}
|
||||||
ctx->backends.push_back(backend);
|
ctx->backends.push_back(backend);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(GGML_USE_RPC)
|
||||||
|
if (model->n_gpu_layers > 0) {
|
||||||
|
for (const auto & endpoint : model->rpc_servers) {
|
||||||
|
ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
|
||||||
|
if (backend == nullptr) {
|
||||||
|
LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
|
||||||
|
llama_free(ctx);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
ctx->backends.push_back(backend);
|
||||||
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||||
if (ctx->backend_cpu == nullptr) {
|
if (ctx->backend_cpu == nullptr) {
|
||||||
|
Loading…
Reference in New Issue
Block a user