[SYCL] Add SYCL Backend registry, device and Event Interfaces (#9705)

* implemented missing SYCL event APIs

* sycl : Added device and backend reg interfaces

* Restructured ggml-sycl.cpp
This commit is contained in:
Ouadie EL FAROUKI 2024-10-18 06:46:16 +01:00 committed by GitHub
parent 60ce97c9d8
commit 87421a23e8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 1492 additions and 1281 deletions

View File

@ -151,7 +151,7 @@ static std::string get_gpu_info() {
int count = ggml_backend_sycl_get_device_count(); int count = ggml_backend_sycl_get_device_count();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
char buf[128]; char buf[128];
ggml_sycl_get_device_description(i, buf, sizeof(buf)); ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
id += buf; id += buf;
if (i < count - 1) { if (i < count - 1) {
id += "/"; id += "/";

View File

@ -19,6 +19,8 @@ extern "C" {
// backend API // backend API
GGML_API ggml_backend_t ggml_backend_sycl_init(int device); GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend);
// device buffer // device buffer
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device); GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
@ -29,14 +31,19 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const fl
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void); GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len); GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size); GGML_API void ggml_backend_sycl_get_device_description(int device,
char *description,
size_t description_size);
GGML_API int ggml_backend_sycl_get_device_count(); GGML_API int ggml_backend_sycl_get_device_count();
GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
// SYCL doesn't support registering host memory, keep here for reference // SYCL doesn't support registering host memory, keep here for reference
// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); // GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer); // GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
GGML_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -537,6 +537,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
#include "ggml-metal.h" #include "ggml-metal.h"
#endif #endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#ifdef GGML_USE_VULKAN #ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h" #include "ggml-vulkan.h"
#endif #endif
@ -568,6 +572,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
register_backend(ggml_backend_metal_reg()); register_backend(ggml_backend_metal_reg());
#endif #endif
#ifdef GGML_USE_SYCL
register_backend(ggml_backend_sycl_reg());
#endif
#ifdef GGML_USE_VULKAN #ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg()); register_backend(ggml_backend_vk_reg());
#endif #endif
@ -581,7 +588,7 @@ struct ggml_backend_registry {
register_backend(ggml_backend_amx_reg()); register_backend(ggml_backend_amx_reg());
#endif #endif
// TODO: sycl, kompute, cann // TODO: kompute, cann
register_backend(ggml_backend_cpu_reg()); register_backend(ggml_backend_cpu_reg());
} }
@ -2254,6 +2261,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->backends[b] = backends[b]; sched->backends[b] = backends[b];
sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]); sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b])); GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
if (sched->n_copies > 1) { if (sched->n_copies > 1) {
for (int c = 0; c < sched->n_copies; c++) { for (int c = 0; c < sched->n_copies; c++) {
sched->events[b][c] = ggml_backend_event_new(backends[b]->device); sched->events[b][c] = ggml_backend_event_new(backends[b]->device);

File diff suppressed because it is too large Load Diff

View File

@ -8,9 +8,7 @@
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#if defined(GGML_USE_SYCL) #if defined(GGML_USE_KOMPUTE)
# include "ggml-sycl.h"
#elif defined(GGML_USE_KOMPUTE)
# include "ggml-kompute.h" # include "ggml-kompute.h"
#elif defined(GGML_USE_CANN) #elif defined(GGML_USE_CANN)
# include "ggml-cann.h" # include "ggml-cann.h"
@ -3422,9 +3420,11 @@ struct llama_lora_adapter {
static int llama_get_device_count(const llama_model & model) { static int llama_get_device_count(const llama_model & model) {
int count = (int) model.devices.size(); int count = (int) model.devices.size();
#if defined(GGML_USE_SYCL) #if defined(GGML_USE_RPC)
count += ggml_backend_sycl_get_device_count(); count += (int) model.rpc_servers.size();
#elif defined(GGML_USE_CANN) #endif
#if defined(GGML_USE_CANN)
count += ggml_backend_cann_get_device_count(); count += ggml_backend_cann_get_device_count();
#endif #endif
@ -3445,11 +3445,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode
} }
} }
#if defined(GGML_USE_SYCL) #if defined(GGML_USE_CANN)
if (host_buffer) {
buft = ggml_backend_sycl_host_buffer_type();
}
#elif defined(GGML_USE_CANN)
if (host_buffer) { if (host_buffer) {
buft = ggml_backend_cann_host_buffer_type(); buft = ggml_backend_cann_host_buffer_type();
} }
@ -3473,9 +3469,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
} }
device -= (int)model.devices.size(); device -= (int)model.devices.size();
#if defined(GGML_USE_SYCL) #if defined(GGML_USE_KOMPUTE)
buft = ggml_backend_sycl_buffer_type(device);
#elif defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(device); buft = ggml_backend_kompute_buffer_type(device);
#elif defined(GGML_USE_CANN) #elif defined(GGML_USE_CANN)
buft = ggml_backend_cann_buffer_type(device); buft = ggml_backend_cann_buffer_type(device);
@ -3505,12 +3499,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
} }
} }
#ifdef GGML_USE_SYCL
if (ggml_backend_sycl_get_device_count() > 1) {
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
}
#endif
if (buft == nullptr) { if (buft == nullptr) {
buft = llama_default_buffer_type_offload(model, fallback_gpu); buft = llama_default_buffer_type_offload(model, fallback_gpu);
} }
@ -3528,12 +3516,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
return free; return free;
} }
#if defined(GGML_USE_SYCL) #if defined(GGML_USE_CANN)
size_t total;
size_t free;
ggml_backend_sycl_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_CANN)
size_t total; size_t total;
size_t free; size_t free;
ggml_backend_cann_get_device_memory(device, &free, &total); ggml_backend_cann_get_device_memory(device, &free, &total);
@ -19096,7 +19079,7 @@ bool llama_supports_mlock(void) {
} }
bool llama_supports_gpu_offload(void) { bool llama_supports_gpu_offload(void) {
#if defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) #if defined(GGML_USE_KOMPUTE)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU. // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
return true; return true;
#else #else
@ -19428,29 +19411,7 @@ struct llama_context * llama_new_context_with_model(
main_gpu -= (int)model->devices.size(); main_gpu -= (int)model->devices.size();
} }
#if defined(GGML_USE_SYCL) #if defined(GGML_USE_KOMPUTE)
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
} else {
// LLAMA_SPLIT_LAYER requires a backend for each GPU
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
ggml_backend_t backend = ggml_backend_sycl_init(i);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
}
#elif defined(GGML_USE_KOMPUTE)
if (model->n_gpu_layers > 0) { if (model->n_gpu_layers > 0) {
auto * backend = ggml_backend_kompute_init(main_gpu); auto * backend = ggml_backend_kompute_init(main_gpu);
if (backend == nullptr) { if (backend == nullptr) {