mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-13 22:08:55 +01:00
llama : refactor model loader with backend registry (#10026)
This commit is contained in:
parent
8f275a7c45
commit
c5b0f4b5d9
@ -21,12 +21,6 @@
|
|||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "ggml-cuda.h"
|
|
||||||
#include "ggml-sycl.h"
|
|
||||||
|
|
||||||
#ifdef GGML_USE_CANN
|
|
||||||
#include "ggml-cann.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#define WIN32_LEAN_AND_MEAN
|
#define WIN32_LEAN_AND_MEAN
|
||||||
@ -82,95 +76,27 @@ static T stdev(const std::vector<T> & v) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static std::string get_cpu_info() {
|
static std::string get_cpu_info() {
|
||||||
std::string id;
|
std::vector<std::string> cpu_list;
|
||||||
#ifdef __linux__
|
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||||
FILE * f = fopen("/proc/cpuinfo", "r");
|
auto * dev = ggml_backend_dev_get(i);
|
||||||
if (f) {
|
auto dev_type = ggml_backend_dev_type(dev);
|
||||||
char buf[1024];
|
if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
|
||||||
while (fgets(buf, sizeof(buf), f)) {
|
cpu_list.push_back(ggml_backend_dev_description(dev));
|
||||||
if (strncmp(buf, "model name", 10) == 0) {
|
|
||||||
char * p = strchr(buf, ':');
|
|
||||||
if (p) {
|
|
||||||
p++;
|
|
||||||
while (std::isspace(*p)) {
|
|
||||||
p++;
|
|
||||||
}
|
|
||||||
while (std::isspace(p[strlen(p) - 1])) {
|
|
||||||
p[strlen(p) - 1] = '\0';
|
|
||||||
}
|
|
||||||
id = p;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fclose(f);
|
|
||||||
}
|
|
||||||
#elif defined(_WIN32)
|
|
||||||
HKEY hKey;
|
|
||||||
if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
|
|
||||||
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
|
|
||||||
0,
|
|
||||||
KEY_READ,
|
|
||||||
&hKey) != ERROR_SUCCESS) {
|
|
||||||
// fail to open registry key
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
char cpu_brand[256];
|
|
||||||
DWORD cpu_brand_size = sizeof(cpu_brand);
|
|
||||||
if (RegQueryValueExA(hKey,
|
|
||||||
TEXT("ProcessorNameString"),
|
|
||||||
NULL,
|
|
||||||
NULL,
|
|
||||||
(LPBYTE)cpu_brand,
|
|
||||||
&cpu_brand_size) == ERROR_SUCCESS) {
|
|
||||||
id.assign(cpu_brand, cpu_brand_size);
|
|
||||||
if (id.find('\0') != std::string::npos) {
|
|
||||||
id.resize(id.find('\0'));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
RegCloseKey(hKey);
|
return join(cpu_list, ", ");
|
||||||
#endif
|
|
||||||
// TODO: other platforms
|
|
||||||
return id;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string get_gpu_info() {
|
static std::string get_gpu_info() {
|
||||||
std::string id;
|
std::vector<std::string> gpu_list;
|
||||||
#ifdef GGML_USE_CUDA
|
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||||
int count = ggml_backend_cuda_get_device_count();
|
auto * dev = ggml_backend_dev_get(i);
|
||||||
for (int i = 0; i < count; i++) {
|
auto dev_type = ggml_backend_dev_type(dev);
|
||||||
char buf[128];
|
if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
|
||||||
ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
|
gpu_list.push_back(ggml_backend_dev_description(dev));
|
||||||
id += buf;
|
|
||||||
if (i < count - 1) {
|
|
||||||
id += "/";
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
return join(gpu_list, ", ");
|
||||||
#ifdef GGML_USE_SYCL
|
|
||||||
int count = ggml_backend_sycl_get_device_count();
|
|
||||||
for (int i = 0; i < count; i++) {
|
|
||||||
char buf[128];
|
|
||||||
ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
|
|
||||||
id += buf;
|
|
||||||
if (i < count - 1) {
|
|
||||||
id += "/";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#ifdef GGML_USE_CANN
|
|
||||||
uint32_t count = ggml_backend_cann_get_device_count();
|
|
||||||
for (uint32_t i = 0; i < count; i++) {
|
|
||||||
char buf[128];
|
|
||||||
ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
|
|
||||||
id += buf;
|
|
||||||
if (i < count - 1) {
|
|
||||||
id += "/";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
// TODO: other backends
|
|
||||||
return id;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// command line params
|
// command line params
|
||||||
@ -938,29 +864,15 @@ struct test {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static std::string get_backend() {
|
static std::string get_backend() {
|
||||||
if (cuda) {
|
std::vector<std::string> backends;
|
||||||
return GGML_CUDA_NAME;
|
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
||||||
|
auto * reg = ggml_backend_reg_get(i);
|
||||||
|
std::string name = ggml_backend_reg_name(reg);
|
||||||
|
if (name != "CPU") {
|
||||||
|
backends.push_back(ggml_backend_reg_name(reg));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (vulkan) {
|
return backends.empty() ? "CPU" : join(backends, ",");
|
||||||
return "Vulkan";
|
|
||||||
}
|
|
||||||
if (kompute) {
|
|
||||||
return "Kompute";
|
|
||||||
}
|
|
||||||
if (metal) {
|
|
||||||
return "Metal";
|
|
||||||
}
|
|
||||||
if (sycl) {
|
|
||||||
return GGML_SYCL_NAME;
|
|
||||||
}
|
|
||||||
if (gpu_blas) {
|
|
||||||
return "GPU BLAS";
|
|
||||||
}
|
|
||||||
if (blas) {
|
|
||||||
return "BLAS";
|
|
||||||
}
|
|
||||||
|
|
||||||
return "CPU";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static const std::vector<std::string> & get_fields() {
|
static const std::vector<std::string> & get_fields() {
|
||||||
|
@ -114,11 +114,12 @@ extern "C" {
|
|||||||
//
|
//
|
||||||
|
|
||||||
enum ggml_backend_dev_type {
|
enum ggml_backend_dev_type {
|
||||||
|
// CPU device using system memory
|
||||||
GGML_BACKEND_DEVICE_TYPE_CPU,
|
GGML_BACKEND_DEVICE_TYPE_CPU,
|
||||||
|
// GPU device using dedicated memory
|
||||||
GGML_BACKEND_DEVICE_TYPE_GPU,
|
GGML_BACKEND_DEVICE_TYPE_GPU,
|
||||||
// devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
|
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
|
||||||
GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
|
GGML_BACKEND_DEVICE_TYPE_ACCEL
|
||||||
GGML_BACKEND_DEVICE_TYPE_GPU_FULL
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// functionality supported by the device
|
// functionality supported by the device
|
||||||
@ -167,10 +168,14 @@ extern "C" {
|
|||||||
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
|
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
|
||||||
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
|
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
|
||||||
|
|
||||||
|
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
|
||||||
|
|
||||||
// Functions that may be obtained using ggml_backend_reg_get_proc_address
|
// Split buffer type for tensor parallelism
|
||||||
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
|
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
|
||||||
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int);
|
// Set the number of threads for the backend
|
||||||
|
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
|
||||||
|
// Get additional buffer types provided by the device (returns a NULL-terminated array)
|
||||||
|
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Backend registry
|
// Backend registry
|
||||||
@ -192,7 +197,7 @@ extern "C" {
|
|||||||
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
|
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
|
||||||
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
|
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
|
||||||
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
|
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
|
||||||
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
|
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
|
||||||
GGML_API ggml_backend_t ggml_backend_init_best(void);
|
GGML_API ggml_backend_t ggml_backend_init_best(void);
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -28,7 +28,7 @@ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
|||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
||||||
|
|
||||||
// split tensor buffer that splits matrices by rows across multiple devices
|
// split tensor buffer that splits matrices by rows across multiple devices
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
|
||||||
|
|
||||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
||||||
|
@ -16,12 +16,6 @@
|
|||||||
#if defined(__AMX_INT8__)
|
#if defined(__AMX_INT8__)
|
||||||
|
|
||||||
// AMX buffer interface
|
// AMX buffer interface
|
||||||
static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
||||||
return "AMX";
|
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
free(buffer->context);
|
free(buffer->context);
|
||||||
}
|
}
|
||||||
@ -72,7 +66,6 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
|
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
|
||||||
/* .get_name = */ ggml_backend_amx_buffer_get_name,
|
|
||||||
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_amx_buffer_get_base,
|
/* .get_base = */ ggml_backend_amx_buffer_get_base,
|
||||||
/* .init_tensor = */ NULL, // no initialization required
|
/* .init_tensor = */ NULL, // no initialization required
|
||||||
@ -121,14 +114,14 @@ static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft
|
|||||||
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
|
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
|
||||||
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
|
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
|
||||||
/* .iface = */ {
|
/* .iface = */ {
|
||||||
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
|
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
|
||||||
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
|
||||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
|
||||||
/* .is_host = */ ggml_backend_amx_buffer_type_is_host,
|
/* .is_host = */ ggml_backend_amx_buffer_type_is_host,
|
||||||
},
|
},
|
||||||
/* .device = */ NULL,
|
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
|
||||||
/* .context = */ NULL,
|
/* .context = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -149,12 +142,6 @@ static void ggml_backend_amx_free(ggml_backend_t backend) {
|
|||||||
delete backend;
|
delete backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) {
|
|
||||||
return ggml_backend_amx_buffer_type();
|
|
||||||
|
|
||||||
GGML_UNUSED(backend);
|
|
||||||
}
|
|
||||||
|
|
||||||
static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||||
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
|
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
|
||||||
|
|
||||||
@ -187,7 +174,6 @@ static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, s
|
|||||||
static struct ggml_backend_i ggml_backend_amx_i = {
|
static struct ggml_backend_i ggml_backend_amx_i = {
|
||||||
/* .get_name = */ ggml_backend_amx_name,
|
/* .get_name = */ ggml_backend_amx_name,
|
||||||
/* .free = */ ggml_backend_amx_free,
|
/* .free = */ ggml_backend_amx_free,
|
||||||
/* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type,
|
|
||||||
/* .set_tensor_async = */ NULL,
|
/* .set_tensor_async = */ NULL,
|
||||||
/* .get_tensor_async = */ NULL,
|
/* .get_tensor_async = */ NULL,
|
||||||
/* .cpy_tensor_async = */ NULL,
|
/* .cpy_tensor_async = */ NULL,
|
||||||
@ -197,9 +183,6 @@ static struct ggml_backend_i ggml_backend_amx_i = {
|
|||||||
/* .graph_plan_update = */ NULL,
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_amx_graph_compute,
|
/* .graph_compute = */ ggml_backend_amx_graph_compute,
|
||||||
/* .supports_op = */ NULL,
|
|
||||||
/* .supports_buft = */ NULL,
|
|
||||||
/* .offload_op = */ NULL,
|
|
||||||
/* .event_record = */ NULL,
|
/* .event_record = */ NULL,
|
||||||
/* .event_wait = */ NULL,
|
/* .event_wait = */ NULL,
|
||||||
};
|
};
|
||||||
@ -279,7 +262,7 @@ static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t *
|
|||||||
}
|
}
|
||||||
|
|
||||||
static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
|
static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
|
||||||
return GGML_BACKEND_DEVICE_TYPE_CPU;
|
return GGML_BACKEND_DEVICE_TYPE_ACCEL;
|
||||||
|
|
||||||
GGML_UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
}
|
}
|
||||||
|
@ -22,7 +22,7 @@ extern "C" {
|
|||||||
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
|
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
|
||||||
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
|
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
|
||||||
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
||||||
// (optional) check if tensor data is in host memory (defaults to false)
|
// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
|
||||||
bool (*is_host) (ggml_backend_buffer_type_t buft);
|
bool (*is_host) (ggml_backend_buffer_type_t buft);
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -37,7 +37,6 @@ extern "C" {
|
|||||||
//
|
//
|
||||||
|
|
||||||
struct ggml_backend_buffer_i {
|
struct ggml_backend_buffer_i {
|
||||||
const char * (*get_name) (ggml_backend_buffer_t buffer);
|
|
||||||
// (optional) free the buffer
|
// (optional) free the buffer
|
||||||
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
||||||
// base address of the buffer
|
// base address of the buffer
|
||||||
@ -88,19 +87,16 @@ extern "C" {
|
|||||||
|
|
||||||
void (*free)(ggml_backend_t backend);
|
void (*free)(ggml_backend_t backend);
|
||||||
|
|
||||||
// Will be moved to the device interface
|
|
||||||
// buffer allocation
|
|
||||||
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
|
|
||||||
|
|
||||||
// (optional) asynchronous tensor data access
|
// (optional) asynchronous tensor data access
|
||||||
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||||
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
|
|
||||||
// (optional) complete all pending operations
|
// (optional) complete all pending operations (required if the backend supports async operations)
|
||||||
void (*synchronize)(ggml_backend_t backend);
|
void (*synchronize)(ggml_backend_t backend);
|
||||||
|
|
||||||
// (optional) compute graph with a plan (not used currently)
|
// (optional) graph plans (not used currently)
|
||||||
|
// compute graph with a plan
|
||||||
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
||||||
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||||
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
|
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
|
||||||
@ -111,13 +107,6 @@ extern "C" {
|
|||||||
// compute graph (always async if supported by the backend)
|
// compute graph (always async if supported by the backend)
|
||||||
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||||
|
|
||||||
// IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
|
|
||||||
// new backends should implement the device interface instead
|
|
||||||
// These functions are being moved to the device interface
|
|
||||||
bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
|
|
||||||
bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
|
|
||||||
bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
|
|
||||||
|
|
||||||
// (optional) event synchronization
|
// (optional) event synchronization
|
||||||
// record an event on this stream
|
// record an event on this stream
|
||||||
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
|
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
|
||||||
|
@ -34,6 +34,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
|
if (size == 0) {
|
||||||
|
// return a dummy buffer for zero-sized allocations
|
||||||
|
return ggml_backend_buffer_init(buft, {}, NULL, 0);
|
||||||
|
}
|
||||||
|
|
||||||
return buft->iface.alloc_buffer(buft, size);
|
return buft->iface.alloc_buffer(buft, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -89,7 +94,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
|
|||||||
}
|
}
|
||||||
|
|
||||||
const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
|
const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
|
||||||
return buffer->iface.get_name(buffer);
|
return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
||||||
@ -108,6 +113,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
|
// get_base is optional if the buffer is zero-sized
|
||||||
|
if (buffer->size == 0) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
void * base = buffer->iface.get_base(buffer);
|
void * base = buffer->iface.get_base(buffer);
|
||||||
|
|
||||||
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
|
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
|
||||||
@ -122,6 +132,15 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
|
// clear is optional if the buffer is zero-sized
|
||||||
|
if (buffer->size == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
buffer->iface.clear(buffer, value);
|
||||||
|
}
|
||||||
|
|
||||||
size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
|
size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
|
||||||
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
|
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
|
||||||
}
|
}
|
||||||
@ -134,10 +153,6 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
|
|||||||
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
||||||
buffer->iface.clear(buffer, value);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
||||||
return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
|
return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
|
||||||
}
|
}
|
||||||
@ -198,7 +213,7 @@ void ggml_backend_free(ggml_backend_t backend) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
|
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
|
||||||
return backend->iface.get_default_buffer_type(backend);
|
return ggml_backend_dev_buffer_type(backend->device);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
|
ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
|
||||||
@ -238,43 +253,42 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
|
|||||||
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||||
|
|
||||||
|
if (size == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||||
|
|
||||||
if (!size) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
buf->iface.set_tensor(buf, tensor, data, offset, size);
|
buf->iface.set_tensor(buf, tensor, data, offset, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||||
|
|
||||||
|
if (size == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
||||||
|
|
||||||
if (!size) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
buf->iface.get_tensor(buf, tensor, data, offset, size);
|
buf->iface.get_tensor(buf, tensor, data, offset, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||||
|
|
||||||
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
if (size == 0) {
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
||||||
|
|
||||||
if (!size) {
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
|
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
||||||
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||||
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||||
|
GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
|
||||||
|
|
||||||
buf->iface.memset_tensor(buf, tensor, value, offset, size);
|
buf->iface.memset_tensor(buf, tensor, value, offset, size);
|
||||||
}
|
}
|
||||||
@ -316,32 +330,15 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||||
// helper to ease transition to device interface
|
return ggml_backend_dev_supports_op(backend->device, op);
|
||||||
if (backend->device) {
|
|
||||||
return ggml_backend_dev_supports_op(backend->device, op);
|
|
||||||
}
|
|
||||||
|
|
||||||
return backend->iface.supports_op(backend, op);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||||
// helper to ease transition to device interface
|
return ggml_backend_dev_supports_buft(backend->device, buft);
|
||||||
if (backend->device) {
|
|
||||||
return ggml_backend_dev_supports_buft(backend->device, buft);
|
|
||||||
}
|
|
||||||
return backend->iface.supports_buft(backend, buft);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||||
// helper to ease transition to device interface
|
return ggml_backend_dev_offload_op(backend->device, op);
|
||||||
if (backend->device) {
|
|
||||||
return ggml_backend_dev_offload_op(backend->device, op);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (backend->iface.offload_op != NULL) {
|
|
||||||
return backend->iface.offload_op(backend, op);
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
|
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
|
||||||
@ -582,6 +579,9 @@ struct ggml_backend_registry {
|
|||||||
#ifdef GGML_USE_VULKAN
|
#ifdef GGML_USE_VULKAN
|
||||||
register_backend(ggml_backend_vk_reg());
|
register_backend(ggml_backend_vk_reg());
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef GGML_USE_CANN
|
||||||
|
register_backend(ggml_backend_cann_reg());
|
||||||
|
#endif
|
||||||
#ifdef GGML_USE_BLAS
|
#ifdef GGML_USE_BLAS
|
||||||
register_backend(ggml_backend_blas_reg());
|
register_backend(ggml_backend_blas_reg());
|
||||||
#endif
|
#endif
|
||||||
@ -591,9 +591,6 @@ struct ggml_backend_registry {
|
|||||||
#ifdef GGML_USE_AMX
|
#ifdef GGML_USE_AMX
|
||||||
register_backend(ggml_backend_amx_reg());
|
register_backend(ggml_backend_amx_reg());
|
||||||
#endif
|
#endif
|
||||||
#ifdef GGML_USE_CANN
|
|
||||||
register_backend(ggml_backend_cann_reg());
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// TODO: kompute
|
// TODO: kompute
|
||||||
|
|
||||||
@ -701,9 +698,9 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const
|
|||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_init_best(void) {
|
ggml_backend_t ggml_backend_init_best(void) {
|
||||||
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
|
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
|
||||||
if (!dev) {
|
if (!dev) {
|
||||||
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
|
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||||
}
|
}
|
||||||
if (!dev) {
|
if (!dev) {
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -711,13 +708,7 @@ ggml_backend_t ggml_backend_init_best(void) {
|
|||||||
return ggml_backend_dev_init(dev, NULL);
|
return ggml_backend_dev_init(dev, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
// backend CPU
|
// CPU backend - buffer
|
||||||
|
|
||||||
static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
||||||
return "CPU";
|
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
uintptr_t data = (uintptr_t)buffer->context;
|
uintptr_t data = (uintptr_t)buffer->context;
|
||||||
@ -767,7 +758,6 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
||||||
/* .get_name = */ ggml_backend_cpu_buffer_get_name,
|
|
||||||
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||||||
/* .init_tensor = */ NULL, // no initialization required
|
/* .init_tensor = */ NULL, // no initialization required
|
||||||
@ -780,7 +770,6 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
||||||
/* .get_name = */ ggml_backend_cpu_buffer_get_name,
|
|
||||||
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
||||||
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||||||
/* .init_tensor = */ NULL, // no initialization required
|
/* .init_tensor = */ NULL, // no initialization required
|
||||||
@ -792,6 +781,8 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
|||||||
/* .reset = */ NULL,
|
/* .reset = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// CPU backend - buffer type
|
||||||
|
|
||||||
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||||
return "CPU";
|
return "CPU";
|
||||||
|
|
||||||
@ -799,19 +790,14 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
|
|||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
auto alloc_size = size;
|
void * data = ggml_aligned_malloc(size);
|
||||||
if (alloc_size == 0) {
|
|
||||||
alloc_size = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
void * data = ggml_aligned_malloc(alloc_size);
|
|
||||||
|
|
||||||
if (data == NULL) {
|
if (data == NULL) {
|
||||||
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
|
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size);
|
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||||
@ -843,6 +829,29 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|||||||
return &ggml_backend_cpu_buffer_type;
|
return &ggml_backend_cpu_buffer_type;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||||
|
return "CPU_Mapped";
|
||||||
|
|
||||||
|
GGML_UNUSED(buft);
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
|
||||||
|
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
||||||
|
/* .iface = */ {
|
||||||
|
/* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
|
||||||
|
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
||||||
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||||
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||||
|
},
|
||||||
|
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||||
|
/* .context = */ NULL,
|
||||||
|
};
|
||||||
|
|
||||||
|
return &ggml_backend_cpu_buffer_type;
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef GGML_USE_CPU_HBM
|
#ifdef GGML_USE_CPU_HBM
|
||||||
|
|
||||||
// buffer type HBM
|
// buffer type HBM
|
||||||
@ -855,18 +864,11 @@ static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffe
|
|||||||
GGML_UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
|
|
||||||
return "CPU_HBM";
|
|
||||||
|
|
||||||
GGML_UNUSED(buf);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
hbw_free(buffer->context);
|
hbw_free(buffer->context);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
//void * ptr = hbw_malloc(size);
|
|
||||||
void * ptr;
|
void * ptr;
|
||||||
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
||||||
if (result != 0) {
|
if (result != 0) {
|
||||||
@ -876,7 +878,6 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
|
|||||||
|
|
||||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||||
buffer->buft = buft;
|
buffer->buft = buft;
|
||||||
buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
|
|
||||||
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
||||||
|
|
||||||
return buffer;
|
return buffer;
|
||||||
@ -899,6 +900,21 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
|
||||||
|
static ggml_backend_buffer_type_t bufts[] = {
|
||||||
|
#ifdef GGML_USE_CPU_HBM
|
||||||
|
ggml_backend_cpu_hbm_buffer_type(),
|
||||||
|
#endif
|
||||||
|
NULL
|
||||||
|
};
|
||||||
|
|
||||||
|
return bufts;
|
||||||
|
|
||||||
|
GGML_UNUSED(device);
|
||||||
|
}
|
||||||
|
|
||||||
|
// CPU backend - backend (stream)
|
||||||
|
|
||||||
struct ggml_backend_cpu_context {
|
struct ggml_backend_cpu_context {
|
||||||
int n_threads;
|
int n_threads;
|
||||||
ggml_threadpool_t threadpool;
|
ggml_threadpool_t threadpool;
|
||||||
@ -923,12 +939,6 @@ static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
|||||||
delete backend;
|
delete backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
|
|
||||||
return ggml_backend_cpu_buffer_type();
|
|
||||||
|
|
||||||
GGML_UNUSED(backend);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ggml_backend_plan_cpu {
|
struct ggml_backend_plan_cpu {
|
||||||
struct ggml_cplan cplan;
|
struct ggml_cplan cplan;
|
||||||
struct ggml_cgraph cgraph;
|
struct ggml_cgraph cgraph;
|
||||||
@ -998,7 +1008,6 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
|
|||||||
static const struct ggml_backend_i ggml_backend_cpu_i = {
|
static const struct ggml_backend_i ggml_backend_cpu_i = {
|
||||||
/* .get_name = */ ggml_backend_cpu_get_name,
|
/* .get_name = */ ggml_backend_cpu_get_name,
|
||||||
/* .free = */ ggml_backend_cpu_free,
|
/* .free = */ ggml_backend_cpu_free,
|
||||||
/* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
|
|
||||||
/* .set_tensor_async = */ NULL,
|
/* .set_tensor_async = */ NULL,
|
||||||
/* .get_tensor_async = */ NULL,
|
/* .get_tensor_async = */ NULL,
|
||||||
/* .cpy_tensor_async = */ NULL,
|
/* .cpy_tensor_async = */ NULL,
|
||||||
@ -1008,9 +1017,6 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
|
|||||||
/* .graph_plan_update = */ NULL,
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
||||||
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
||||||
/* .supports_op = */ NULL,
|
|
||||||
/* .supports_buft = */ NULL,
|
|
||||||
/* .offload_op = */ NULL,
|
|
||||||
/* .event_record = */ NULL,
|
/* .event_record = */ NULL,
|
||||||
/* .event_wait = */ NULL,
|
/* .event_wait = */ NULL,
|
||||||
};
|
};
|
||||||
@ -1081,10 +1087,10 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_
|
|||||||
|
|
||||||
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
||||||
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
|
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
|
||||||
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
|
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////
|
// CPU backend - device
|
||||||
|
|
||||||
struct ggml_backend_cpu_device_context {
|
struct ggml_backend_cpu_device_context {
|
||||||
std::string description = "CPU";
|
std::string description = "CPU";
|
||||||
@ -1171,7 +1177,7 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t *
|
|||||||
}
|
}
|
||||||
|
|
||||||
static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
|
static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
|
||||||
return GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
|
return GGML_BACKEND_DEVICE_TYPE_CPU;
|
||||||
|
|
||||||
GGML_UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
}
|
}
|
||||||
@ -1189,7 +1195,7 @@ static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggm
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) {
|
static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
|
||||||
return ggml_backend_cpu_init();
|
return ggml_backend_cpu_init();
|
||||||
|
|
||||||
GGML_UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
@ -1202,7 +1208,7 @@ static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_b
|
|||||||
GGML_UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||||
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||||
|
|
||||||
GGML_UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
@ -1244,10 +1250,10 @@ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
|
|||||||
/* .get_memory = */ ggml_backend_cpu_device_get_memory,
|
/* .get_memory = */ ggml_backend_cpu_device_get_memory,
|
||||||
/* .get_type = */ ggml_backend_cpu_device_get_type,
|
/* .get_type = */ ggml_backend_cpu_device_get_type,
|
||||||
/* .get_props = */ ggml_backend_cpu_device_get_props,
|
/* .get_props = */ ggml_backend_cpu_device_get_props,
|
||||||
/* .init_backend = */ ggml_backend_cpu_device_init,
|
/* .init_backend = */ ggml_backend_cpu_device_init_backend,
|
||||||
/* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
|
/* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
|
||||||
/* .get_host_buffer_type = */ NULL,
|
/* .get_host_buffer_type = */ NULL,
|
||||||
/* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr,
|
/* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
|
||||||
/* .supports_op = */ ggml_backend_cpu_device_supports_op,
|
/* .supports_op = */ ggml_backend_cpu_device_supports_op,
|
||||||
/* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
|
/* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
|
||||||
/* .offload_op = */ NULL,
|
/* .offload_op = */ NULL,
|
||||||
@ -1256,7 +1262,7 @@ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
|
|||||||
/* .event_synchronize = */ NULL,
|
/* .event_synchronize = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
////////////////////////
|
// CPU backend - backend (reg)
|
||||||
|
|
||||||
static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
|
static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
|
||||||
return "CPU";
|
return "CPU";
|
||||||
@ -1287,6 +1293,10 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
|
|||||||
if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
|
if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
|
||||||
return (void *)ggml_backend_cpu_set_n_threads;
|
return (void *)ggml_backend_cpu_set_n_threads;
|
||||||
}
|
}
|
||||||
|
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
|
||||||
|
return (void *)ggml_backend_cpu_get_extra_bufts;
|
||||||
|
}
|
||||||
|
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
GGML_UNUSED(reg);
|
GGML_UNUSED(reg);
|
||||||
@ -1315,12 +1325,6 @@ struct ggml_backend_multi_buffer_context {
|
|||||||
size_t n_buffers;
|
size_t n_buffers;
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
||||||
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
|
||||||
|
|
||||||
return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
||||||
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
||||||
@ -1339,7 +1343,6 @@ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_
|
|||||||
}
|
}
|
||||||
|
|
||||||
static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
|
static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
|
||||||
/* .get_name = */ ggml_backend_multi_buffer_get_name,
|
|
||||||
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
|
||||||
/* .get_base = */ NULL,
|
/* .get_base = */ NULL,
|
||||||
/* .init_tensor = */ NULL,
|
/* .init_tensor = */ NULL,
|
||||||
@ -1368,7 +1371,7 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
|
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
|
||||||
return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
|
return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
||||||
@ -1460,7 +1463,7 @@ struct ggml_backend_sched {
|
|||||||
char * context_buffer;
|
char * context_buffer;
|
||||||
size_t context_buffer_size;
|
size_t context_buffer_size;
|
||||||
|
|
||||||
bool debug;
|
int debug;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
||||||
@ -1500,7 +1503,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
#if 1
|
||||||
#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
|
#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
|
||||||
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
|
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
|
||||||
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
||||||
@ -1548,7 +1551,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|||||||
if (src == NULL) {
|
if (src == NULL) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
// skip ROPE since the rope freqs tensor is too small to choose a backend based on it
|
||||||
|
// not an ideal solution
|
||||||
|
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||||
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
||||||
// check if a backend with higher prio wants to offload the op
|
// check if a backend with higher prio wants to offload the op
|
||||||
if (src_backend_id == sched->n_backends - 1) {
|
if (src_backend_id == sched->n_backends - 1) {
|
||||||
@ -1595,19 +1600,21 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
|||||||
if (ggml_is_view_op(node->op)) {
|
if (ggml_is_view_op(node->op)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
|
if (sched->debug > 1) {
|
||||||
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
|
||||||
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
|
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
|
||||||
struct ggml_tensor * src = node->src[j];
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
if (src == NULL) {
|
struct ggml_tensor * src = node->src[j];
|
||||||
continue;
|
if (src == NULL) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
|
||||||
|
GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
||||||
|
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
||||||
}
|
}
|
||||||
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
|
GGML_LOG_DEBUG("\n");
|
||||||
GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
|
||||||
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
|
||||||
}
|
}
|
||||||
GGML_LOG_DEBUG("\n");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1899,11 +1906,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
if (src == NULL) {
|
if (src == NULL) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// check if a weight is on a different backend
|
// check if a weight is on a different and incompatible backend
|
||||||
// by starting a new split, the memory of the previously offloaded weights can be reused
|
// by starting a new split, the memory of the previously offloaded weights can be reused
|
||||||
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||||
int src_backend_id = tensor_backend_id(src);
|
int src_backend_id = tensor_backend_id(src);
|
||||||
if (src_backend_id != cur_backend_id) {
|
if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
|
||||||
need_new_split = true;
|
need_new_split = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -1915,7 +1922,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
int src_backend_id = sched->hv_tensor_backend_ids[id];
|
int src_backend_id = sched->hv_tensor_backend_ids[id];
|
||||||
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
||||||
if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
|
if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
|
||||||
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
|
||||||
need_new_split = true;
|
need_new_split = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -2240,7 +2246,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|||||||
|
|
||||||
struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
|
struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
|
||||||
|
|
||||||
sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
|
const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
|
||||||
|
sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
|
||||||
sched->n_backends = n_backends;
|
sched->n_backends = n_backends;
|
||||||
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
||||||
|
|
||||||
|
@ -224,12 +224,6 @@ static void ggml_backend_blas_free(ggml_backend_t backend) {
|
|||||||
delete backend;
|
delete backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
|
|
||||||
return ggml_backend_cpu_buffer_type();
|
|
||||||
|
|
||||||
GGML_UNUSED(backend);
|
|
||||||
}
|
|
||||||
|
|
||||||
static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||||
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
|
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
|
||||||
|
|
||||||
@ -265,7 +259,6 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
|
|||||||
static struct ggml_backend_i blas_backend_i = {
|
static struct ggml_backend_i blas_backend_i = {
|
||||||
/* .get_name = */ ggml_backend_blas_get_name,
|
/* .get_name = */ ggml_backend_blas_get_name,
|
||||||
/* .free = */ ggml_backend_blas_free,
|
/* .free = */ ggml_backend_blas_free,
|
||||||
/* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
|
|
||||||
/* .set_tensor_async = */ NULL,
|
/* .set_tensor_async = */ NULL,
|
||||||
/* .get_tensor_async = */ NULL,
|
/* .get_tensor_async = */ NULL,
|
||||||
/* .cpy_tensor_async = */ NULL,
|
/* .cpy_tensor_async = */ NULL,
|
||||||
@ -275,9 +268,6 @@ static struct ggml_backend_i blas_backend_i = {
|
|||||||
/* .graph_plan_update = */ NULL,
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_blas_graph_compute,
|
/* .graph_compute = */ ggml_backend_blas_graph_compute,
|
||||||
/* .supports_op = */ NULL,
|
|
||||||
/* .supports_buft = */ NULL,
|
|
||||||
/* .offload_op = */ NULL,
|
|
||||||
/* .event_record = */ NULL,
|
/* .event_record = */ NULL,
|
||||||
/* .event_wait = */ NULL,
|
/* .event_wait = */ NULL,
|
||||||
};
|
};
|
||||||
@ -356,7 +346,7 @@ static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t *
|
|||||||
}
|
}
|
||||||
|
|
||||||
static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
|
static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
|
||||||
return GGML_BACKEND_DEVICE_TYPE_CPU;
|
return GGML_BACKEND_DEVICE_TYPE_ACCEL;
|
||||||
|
|
||||||
GGML_UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
}
|
}
|
||||||
@ -374,7 +364,7 @@ static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct gg
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_t ggml_backend_blas_device_init(ggml_backend_dev_t dev, const char * params) {
|
static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t dev, const char * params) {
|
||||||
return ggml_backend_blas_init();
|
return ggml_backend_blas_init();
|
||||||
|
|
||||||
GGML_UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
@ -387,7 +377,7 @@ static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_
|
|||||||
GGML_UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||||
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||||
|
|
||||||
GGML_UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
@ -456,10 +446,10 @@ static const struct ggml_backend_device_i ggml_backend_blas_device_i = {
|
|||||||
/* .get_memory = */ ggml_backend_blas_device_get_memory,
|
/* .get_memory = */ ggml_backend_blas_device_get_memory,
|
||||||
/* .get_type = */ ggml_backend_blas_device_get_type,
|
/* .get_type = */ ggml_backend_blas_device_get_type,
|
||||||
/* .get_props = */ ggml_backend_blas_device_get_props,
|
/* .get_props = */ ggml_backend_blas_device_get_props,
|
||||||
/* .init_backend = */ ggml_backend_blas_device_init,
|
/* .init_backend = */ ggml_backend_blas_device_init_backend,
|
||||||
/* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type,
|
/* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type,
|
||||||
/* .get_host_buffer_type = */ NULL,
|
/* .get_host_buffer_type = */ NULL,
|
||||||
/* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_ptr,
|
/* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr,
|
||||||
/* .supports_op = */ ggml_backend_blas_device_supports_op,
|
/* .supports_op = */ ggml_backend_blas_device_supports_op,
|
||||||
/* .supports_buft = */ ggml_backend_blas_device_supports_buft,
|
/* .supports_buft = */ ggml_backend_blas_device_supports_buft,
|
||||||
/* .offload_op = */ NULL,
|
/* .offload_op = */ NULL,
|
||||||
|
@ -489,23 +489,6 @@ struct ggml_backend_cann_buffer_context {
|
|||||||
~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
|
~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Retrieve the name associated with a CANN buffer.
|
|
||||||
*
|
|
||||||
* This function returns the name of a CANN buffer, which is stored in the
|
|
||||||
* context of the buffer.
|
|
||||||
*
|
|
||||||
* @param buffer The CANN buffer whose name is to be retrieved.
|
|
||||||
* @return A pointer to a C-string containing the name of the buffer.
|
|
||||||
*/
|
|
||||||
|
|
||||||
static const char* ggml_backend_cann_buffer_get_name(
|
|
||||||
ggml_backend_buffer_t buffer) {
|
|
||||||
return "CANN";
|
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Check if a buffer is a CANN buffer.
|
* @brief Check if a buffer is a CANN buffer.
|
||||||
*
|
*
|
||||||
@ -515,9 +498,10 @@ static const char* ggml_backend_cann_buffer_get_name(
|
|||||||
* @param buffer The buffer to check.
|
* @param buffer The buffer to check.
|
||||||
* @return true if the buffer is a CANN buffer, false otherwise.
|
* @return true if the buffer is a CANN buffer, false otherwise.
|
||||||
*/
|
*/
|
||||||
|
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
|
||||||
static bool ggml_backend_buffer_is_cann(
|
static bool ggml_backend_buffer_is_cann(
|
||||||
ggml_backend_buffer_t buffer) {
|
ggml_backend_buffer_t buffer) {
|
||||||
return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
|
return ggml_backend_buft_is_cann(buffer->buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -965,7 +949,6 @@ static void ggml_backend_cann_buffer_clear(
|
|||||||
* on a CANN buffer within the backend.
|
* on a CANN buffer within the backend.
|
||||||
*/
|
*/
|
||||||
static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
|
static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
|
||||||
/* .get_name = */ ggml_backend_cann_buffer_get_name,
|
|
||||||
/* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_cann_buffer_get_base,
|
/* .get_base = */ ggml_backend_cann_buffer_get_base,
|
||||||
/* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
|
/* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
|
||||||
@ -999,9 +982,10 @@ struct ggml_backend_cann_buffer_type_context {
|
|||||||
*/
|
*/
|
||||||
static const char* ggml_backend_cann_buffer_type_name(
|
static const char* ggml_backend_cann_buffer_type_name(
|
||||||
ggml_backend_buffer_type_t buft) {
|
ggml_backend_buffer_type_t buft) {
|
||||||
return "CANN";
|
ggml_backend_cann_buffer_type_context* buft_ctx =
|
||||||
|
(ggml_backend_cann_buffer_type_context*)buft->context;
|
||||||
|
|
||||||
GGML_UNUSED(buft);
|
return buft_ctx->name.c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -1465,24 +1449,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
|
|||||||
delete backend;
|
delete backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Retrieves the default buffer type associated with the CANN backend.
|
|
||||||
*
|
|
||||||
* This function returns the buffer type specific to the device associated
|
|
||||||
* with the CANN backend. It is used to allocate buffers for computations
|
|
||||||
* performed by the backend.
|
|
||||||
*
|
|
||||||
* @param backend Pointer to the CANN backend structure.
|
|
||||||
* @return Pointer to the buffer type structure for the CANN backend.
|
|
||||||
*/
|
|
||||||
static ggml_backend_buffer_type_t
|
|
||||||
ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
|
|
||||||
ggml_backend_cann_context* cann_ctx =
|
|
||||||
(ggml_backend_cann_context*)backend->context;
|
|
||||||
|
|
||||||
return ggml_backend_cann_buffer_type(cann_ctx->device);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Sets tensor data asynchronously in the CANN backend.
|
* @brief Sets tensor data asynchronously in the CANN backend.
|
||||||
*
|
*
|
||||||
@ -1863,7 +1829,6 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
|
|||||||
static const ggml_backend_i ggml_backend_cann_interface = {
|
static const ggml_backend_i ggml_backend_cann_interface = {
|
||||||
/* .get_name = */ ggml_backend_cann_name,
|
/* .get_name = */ ggml_backend_cann_name,
|
||||||
/* .free = */ ggml_backend_cann_free,
|
/* .free = */ ggml_backend_cann_free,
|
||||||
/* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type,
|
|
||||||
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
|
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
|
||||||
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
|
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
|
||||||
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
|
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
|
||||||
@ -1873,9 +1838,6 @@ static const ggml_backend_i ggml_backend_cann_interface = {
|
|||||||
/* .graph_plan_update = */ NULL,
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_cann_graph_compute,
|
/* .graph_compute = */ ggml_backend_cann_graph_compute,
|
||||||
/* .supports_op = */ NULL, // moved to device
|
|
||||||
/* .supports_buft = */ NULL, // moved to device
|
|
||||||
/* .offload_op = */ NULL, // moved to device
|
|
||||||
/* .event_record = */ ggml_backend_cann_event_record,
|
/* .event_record = */ ggml_backend_cann_event_record,
|
||||||
/* .event_wait = */ ggml_backend_cann_event_wait,
|
/* .event_wait = */ ggml_backend_cann_event_wait,
|
||||||
};
|
};
|
||||||
@ -1918,7 +1880,7 @@ static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t *
|
|||||||
|
|
||||||
static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
|
static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
|
||||||
GGML_UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||||
|
@ -421,20 +421,15 @@ struct ggml_backend_cuda_buffer_context {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
||||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
||||||
return ctx->name.c_str();
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
|
||||||
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
delete ctx;
|
delete ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
||||||
|
return buffer->iface.free_buffer == ggml_backend_cuda_buffer_free_buffer;
|
||||||
|
}
|
||||||
|
|
||||||
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
return ctx->dev_ptr;
|
return ctx->dev_ptr;
|
||||||
@ -515,7 +510,6 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
||||||
/* .get_name = */ ggml_backend_cuda_buffer_get_name,
|
|
||||||
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
||||||
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
|
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
|
||||||
@ -548,8 +542,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
|
|||||||
|
|
||||||
ggml_cuda_set_device(buft_ctx->device);
|
ggml_cuda_set_device(buft_ctx->device);
|
||||||
|
|
||||||
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
|
||||||
|
|
||||||
void * dev_ptr;
|
void * dev_ptr;
|
||||||
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
|
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) {
|
||||||
@ -657,7 +649,9 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_backend_cuda_split_buffer_type_context {
|
struct ggml_backend_cuda_split_buffer_type_context {
|
||||||
|
int main_device;
|
||||||
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
|
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
|
||||||
|
std::string name;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_backend_cuda_split_buffer_context {
|
struct ggml_backend_cuda_split_buffer_context {
|
||||||
@ -680,16 +674,6 @@ struct ggml_backend_cuda_split_buffer_context {
|
|||||||
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
|
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
||||||
return GGML_CUDA_NAME "_Split";
|
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
|
|
||||||
return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
|
|
||||||
GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||||
@ -833,7 +817,6 @@ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, u
|
|||||||
}
|
}
|
||||||
|
|
||||||
static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
||||||
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
|
|
||||||
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
|
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
|
||||||
/* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
|
/* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
|
||||||
@ -848,9 +831,9 @@ static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
|||||||
// cuda split buffer type
|
// cuda split buffer type
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||||
return GGML_CUDA_NAME "_Split";
|
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
||||||
|
|
||||||
GGML_UNUSED(buft);
|
return ctx->name.c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
|
static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
|
||||||
@ -915,11 +898,11 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
|
|||||||
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
|
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
|
||||||
static std::mutex mutex;
|
static std::mutex mutex;
|
||||||
std::lock_guard<std::mutex> lock(mutex);
|
std::lock_guard<std::mutex> lock(mutex);
|
||||||
|
|
||||||
static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
|
static std::map<std::pair<int, std::array<float, GGML_CUDA_MAX_DEVICES>>, struct ggml_backend_buffer_type> buft_map;
|
||||||
|
|
||||||
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
|
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
|
||||||
|
|
||||||
@ -937,18 +920,23 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * ten
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto it = buft_map.find(tensor_split_arr);
|
auto it = buft_map.find({main_device, tensor_split_arr});
|
||||||
if (it != buft_map.end()) {
|
if (it != buft_map.end()) {
|
||||||
return &it->second;
|
return &it->second;
|
||||||
}
|
}
|
||||||
|
auto * ctx = new ggml_backend_cuda_split_buffer_type_context{
|
||||||
|
main_device,
|
||||||
|
tensor_split_arr,
|
||||||
|
GGML_CUDA_NAME + std::to_string(main_device) + "_Split",
|
||||||
|
};
|
||||||
|
|
||||||
struct ggml_backend_buffer_type buft {
|
struct ggml_backend_buffer_type buft {
|
||||||
/* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
|
/* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
|
||||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
|
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), main_device),
|
||||||
/* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
|
/* .context = */ ctx,
|
||||||
};
|
};
|
||||||
|
|
||||||
auto result = buft_map.emplace(tensor_split_arr, buft);
|
auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
|
||||||
return &result.first->second;
|
return &result.first->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -960,12 +948,6 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
|
|||||||
GGML_UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
|
|
||||||
return GGML_CUDA_NAME "_Host";
|
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
CUDA_CHECK(cudaFreeHost(buffer->context));
|
CUDA_CHECK(cudaFreeHost(buffer->context));
|
||||||
}
|
}
|
||||||
@ -998,7 +980,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
|
|||||||
|
|
||||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||||
buffer->buft = buft;
|
buffer->buft = buft;
|
||||||
buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
|
|
||||||
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
|
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
|
||||||
|
|
||||||
return buffer;
|
return buffer;
|
||||||
@ -1400,7 +1381,7 @@ static void ggml_cuda_op_mul_mat(
|
|||||||
|
|
||||||
const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
||||||
|
|
||||||
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
|
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
|
||||||
GGML_ASSERT(!(split && ne02 > 1));
|
GGML_ASSERT(!(split && ne02 > 1));
|
||||||
GGML_ASSERT(!(split && ne03 > 1));
|
GGML_ASSERT(!(split && ne03 > 1));
|
||||||
GGML_ASSERT(!(split && ne02 < ne12));
|
GGML_ASSERT(!(split && ne02 < ne12));
|
||||||
@ -1890,7 +1871,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
|
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
|
||||||
|
|
||||||
bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
|
bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
|
||||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||||
@ -2017,7 +1998,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
|||||||
|
|
||||||
GGML_TENSOR_BINARY_OP_LOCALS
|
GGML_TENSOR_BINARY_OP_LOCALS
|
||||||
|
|
||||||
GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
|
GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
|
||||||
|
|
||||||
cudaStream_t stream = ctx.stream();
|
cudaStream_t stream = ctx.stream();
|
||||||
|
|
||||||
@ -2150,7 +2131,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
|||||||
|
|
||||||
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
|
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
|
||||||
// why is this here instead of mul_mat?
|
// why is this here instead of mul_mat?
|
||||||
if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) {
|
if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
|
||||||
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
|
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2371,12 +2352,6 @@ static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
|||||||
delete backend;
|
delete backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
|
||||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
||||||
|
|
||||||
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||||
@ -2582,7 +2557,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
|
if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
|
||||||
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
|
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
|
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
|
||||||
@ -2669,7 +2644,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
|||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
if (node->src[j] != nullptr) {
|
if (node->src[j] != nullptr) {
|
||||||
assert(node->src[j]->buffer);
|
assert(node->src[j]->buffer);
|
||||||
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
|
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
|
||||||
|
ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -2762,7 +2738,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
|||||||
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
|
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
|
||||||
if (stat == cudaErrorGraphExecUpdateFailure) {
|
if (stat == cudaErrorGraphExecUpdateFailure) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
|
GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
// The pre-existing graph exec cannot be updated due to violated constraints
|
// The pre-existing graph exec cannot be updated due to violated constraints
|
||||||
// so instead clear error and re-instantiate
|
// so instead clear error and re-instantiate
|
||||||
@ -2811,7 +2787,6 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
|
|||||||
static const ggml_backend_i ggml_backend_cuda_interface = {
|
static const ggml_backend_i ggml_backend_cuda_interface = {
|
||||||
/* .get_name = */ ggml_backend_cuda_get_name,
|
/* .get_name = */ ggml_backend_cuda_get_name,
|
||||||
/* .free = */ ggml_backend_cuda_free,
|
/* .free = */ ggml_backend_cuda_free,
|
||||||
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
|
|
||||||
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
|
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
|
||||||
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
|
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
|
||||||
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
|
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
|
||||||
@ -2821,9 +2796,6 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
|
|||||||
/* .graph_plan_update = */ NULL,
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
||||||
/* .supports_op = */ NULL, // moved to device
|
|
||||||
/* .supports_buft = */ NULL, // moved to device
|
|
||||||
/* .offload_op = */ NULL, // moved to device
|
|
||||||
/* .event_record = */ ggml_backend_cuda_event_record,
|
/* .event_record = */ ggml_backend_cuda_event_record,
|
||||||
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
||||||
};
|
};
|
||||||
@ -2913,7 +2885,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
|
|||||||
|
|
||||||
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
|
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
|
||||||
GGML_UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||||
@ -2937,7 +2909,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_t ggml_backend_cuda_device_init(ggml_backend_dev_t dev, const char * params) {
|
static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) {
|
||||||
GGML_UNUSED(params);
|
GGML_UNUSED(params);
|
||||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||||
return ggml_backend_cuda_init(ctx->device);
|
return ggml_backend_cuda_init(ctx->device);
|
||||||
@ -2953,18 +2925,29 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(
|
|||||||
return ggml_backend_cuda_host_buffer_type();
|
return ggml_backend_cuda_host_buffer_type();
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
|
||||||
GGML_UNUSED(dev);
|
|
||||||
GGML_UNUSED(ptr);
|
|
||||||
GGML_UNUSED(size);
|
|
||||||
GGML_UNUSED(max_tensor_size);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: move these functions here
|
// TODO: move these functions here
|
||||||
static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||||
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
|
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
|
||||||
|
|
||||||
|
// split buffers can only be used with GGML_OP_MUL_MAT
|
||||||
|
if (op->op != GGML_OP_MUL_MAT) {
|
||||||
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||||
|
if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// check if all the sources are allocated on this device
|
||||||
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||||
|
if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda(op->src[i]->buffer->buft)) {
|
||||||
|
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)op->src[i]->buffer->buft->context;
|
||||||
|
if (buft_ctx->device != dev_ctx->device) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
switch (op->op) {
|
switch (op->op) {
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(op)) {
|
switch (ggml_get_unary_op(op)) {
|
||||||
@ -3190,24 +3173,27 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||||
if (ggml_backend_buft_is_cuda_split(buft)) {
|
return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
|
||||||
return true;
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if (ggml_backend_buft_is_cuda(buft)) {
|
static int64_t get_op_batch_size(const ggml_tensor * op) {
|
||||||
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
|
switch (op->op) {
|
||||||
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
case GGML_OP_GET_ROWS:
|
||||||
return buft_ctx->device == dev_ctx->device;
|
return 0;
|
||||||
|
case GGML_OP_MUL_MAT:
|
||||||
|
return op->ne[1];
|
||||||
|
case GGML_OP_MUL_MAT_ID:
|
||||||
|
case GGML_OP_ROPE:
|
||||||
|
return op->ne[2];
|
||||||
|
default:
|
||||||
|
return ggml_nrows(op);
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||||
const int min_batch_size = 32;
|
const int min_batch_size = 32;
|
||||||
|
|
||||||
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
return get_op_batch_size(op) >= min_batch_size;
|
||||||
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
|
||||||
|
|
||||||
GGML_UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
}
|
}
|
||||||
@ -3248,10 +3234,10 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
|||||||
/* .get_memory = */ ggml_backend_cuda_device_get_memory,
|
/* .get_memory = */ ggml_backend_cuda_device_get_memory,
|
||||||
/* .get_type = */ ggml_backend_cuda_device_get_type,
|
/* .get_type = */ ggml_backend_cuda_device_get_type,
|
||||||
/* .get_props = */ ggml_backend_cuda_device_get_props,
|
/* .get_props = */ ggml_backend_cuda_device_get_props,
|
||||||
/* .init_backend = */ ggml_backend_cuda_device_init,
|
/* .init_backend = */ ggml_backend_cuda_device_init_backend,
|
||||||
/* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
|
/* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
|
||||||
/* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
|
/* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
|
||||||
/* .buffer_from_host_ptr = */ ggml_backend_cuda_device_buffer_from_host_ptr,
|
/* .buffer_from_host_ptr = */ NULL,
|
||||||
/* .supports_op = */ ggml_backend_cuda_device_supports_op,
|
/* .supports_op = */ ggml_backend_cuda_device_supports_op,
|
||||||
/* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
|
/* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
|
||||||
/* .offload_op = */ ggml_backend_cuda_device_offload_op,
|
/* .offload_op = */ ggml_backend_cuda_device_offload_op,
|
||||||
|
@ -1820,11 +1820,6 @@ static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
||||||
auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer->buft->context);
|
|
||||||
return ctx->name.c_str();
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
auto * memory = (ggml_vk_memory *)buffer->context;
|
auto * memory = (ggml_vk_memory *)buffer->context;
|
||||||
if (ggml_vk_has_device()) {
|
if (ggml_vk_has_device()) {
|
||||||
@ -1868,7 +1863,6 @@ static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint
|
|||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
|
static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
|
||||||
/* .get_name = */ ggml_backend_kompute_buffer_get_name,
|
|
||||||
/* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_kompute_buffer_get_base,
|
/* .get_base = */ ggml_backend_kompute_buffer_get_base,
|
||||||
/* .init_tensor = */ NULL,
|
/* .init_tensor = */ NULL,
|
||||||
@ -1953,11 +1947,6 @@ static void ggml_backend_kompute_free(ggml_backend_t backend) {
|
|||||||
delete backend;
|
delete backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
|
|
||||||
auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
|
|
||||||
return ggml_backend_kompute_buffer_type(ctx->device);
|
|
||||||
}
|
|
||||||
|
|
||||||
static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||||
auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
|
auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
|
||||||
ggml_vk_graph_compute(ctx, cgraph);
|
ggml_vk_graph_compute(ctx, cgraph);
|
||||||
@ -1977,7 +1966,6 @@ static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_back
|
|||||||
static struct ggml_backend_i kompute_backend_i = {
|
static struct ggml_backend_i kompute_backend_i = {
|
||||||
/* .get_name = */ ggml_backend_kompute_name,
|
/* .get_name = */ ggml_backend_kompute_name,
|
||||||
/* .free = */ ggml_backend_kompute_free,
|
/* .free = */ ggml_backend_kompute_free,
|
||||||
/* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type,
|
|
||||||
/* .set_tensor_async = */ NULL,
|
/* .set_tensor_async = */ NULL,
|
||||||
/* .get_tensor_async = */ NULL,
|
/* .get_tensor_async = */ NULL,
|
||||||
/* .cpy_tensor_async = */ NULL,
|
/* .cpy_tensor_async = */ NULL,
|
||||||
@ -1987,9 +1975,6 @@ static struct ggml_backend_i kompute_backend_i = {
|
|||||||
/* .graph_plan_update = */ NULL,
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
|
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
|
||||||
/* .supports_op = */ ggml_backend_kompute_supports_op,
|
|
||||||
/* .supports_buft = */ ggml_backend_kompute_supports_buft,
|
|
||||||
/* .offload_op = */ NULL,
|
|
||||||
/* .event_record = */ NULL,
|
/* .event_record = */ NULL,
|
||||||
/* .event_wait = */ NULL,
|
/* .event_wait = */ NULL,
|
||||||
};
|
};
|
||||||
|
@ -3247,12 +3247,6 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||||||
|
|
||||||
// backend interface
|
// backend interface
|
||||||
|
|
||||||
static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
||||||
return "Metal";
|
|
||||||
|
|
||||||
UNUSED(buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
||||||
|
|
||||||
@ -3307,7 +3301,6 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
|
|||||||
}
|
}
|
||||||
|
|
||||||
static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
|
static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
|
||||||
/* .get_name = */ ggml_backend_metal_buffer_get_name,
|
|
||||||
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_metal_buffer_get_base,
|
/* .get_base = */ ggml_backend_metal_buffer_get_base,
|
||||||
/* .init_tensor = */ NULL,
|
/* .init_tensor = */ NULL,
|
||||||
@ -3432,6 +3425,29 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
|||||||
return &ggml_backend_buffer_type_metal;
|
return &ggml_backend_buffer_type_metal;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||||
|
return "Metal_Mapped";
|
||||||
|
|
||||||
|
UNUSED(buft);
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
|
||||||
|
static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = {
|
||||||
|
/* .iface = */ {
|
||||||
|
/* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name,
|
||||||
|
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
|
||||||
|
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
||||||
|
/* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
|
||||||
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
|
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
|
||||||
|
},
|
||||||
|
/* .device = */ &g_ggml_backend_metal_device,
|
||||||
|
/* .context = */ NULL,
|
||||||
|
};
|
||||||
|
|
||||||
|
return &ggml_backend_buffer_from_ptr_type_metal;
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr
|
// TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr
|
||||||
ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
|
ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
|
struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
|
||||||
@ -3508,7 +3524,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
|
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
// backend
|
// backend
|
||||||
@ -3529,12 +3545,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
|
|||||||
free(backend);
|
free(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
|
|
||||||
return ggml_backend_metal_buffer_type();
|
|
||||||
|
|
||||||
UNUSED(backend);
|
|
||||||
}
|
|
||||||
|
|
||||||
static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||||
return ggml_metal_graph_compute(backend, cgraph);
|
return ggml_metal_graph_compute(backend, cgraph);
|
||||||
}
|
}
|
||||||
@ -3601,7 +3611,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
|||||||
static struct ggml_backend_i ggml_backend_metal_i = {
|
static struct ggml_backend_i ggml_backend_metal_i = {
|
||||||
/* .get_name = */ ggml_backend_metal_name,
|
/* .get_name = */ ggml_backend_metal_name,
|
||||||
/* .free = */ ggml_backend_metal_free,
|
/* .free = */ ggml_backend_metal_free,
|
||||||
/* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type,
|
|
||||||
/* .set_tensor_async = */ NULL,
|
/* .set_tensor_async = */ NULL,
|
||||||
/* .get_tensor_async = */ NULL,
|
/* .get_tensor_async = */ NULL,
|
||||||
/* .cpy_tensor_async = */ NULL,
|
/* .cpy_tensor_async = */ NULL,
|
||||||
@ -3611,9 +3620,6 @@ static struct ggml_backend_i ggml_backend_metal_i = {
|
|||||||
/* .graph_plan_update = */ NULL,
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
||||||
/* .supports_op = */ NULL,
|
|
||||||
/* .supports_buft = */ NULL,
|
|
||||||
/* .offload_op = */ NULL,
|
|
||||||
/* .event_record = */ NULL,
|
/* .event_record = */ NULL,
|
||||||
/* .event_wait = */ NULL,
|
/* .event_wait = */ NULL,
|
||||||
};
|
};
|
||||||
@ -3708,7 +3714,7 @@ static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) {
|
static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) {
|
||||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||||
|
|
||||||
GGML_UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
}
|
}
|
||||||
|
@ -178,7 +178,6 @@ struct ggml_backend_rpc_buffer_context {
|
|||||||
std::shared_ptr<socket_t> sock;
|
std::shared_ptr<socket_t> sock;
|
||||||
std::unordered_map<ggml_backend_buffer_t, void *> base_cache;
|
std::unordered_map<ggml_backend_buffer_t, void *> base_cache;
|
||||||
uint64_t remote_ptr;
|
uint64_t remote_ptr;
|
||||||
std::string name;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// RPC helper functions
|
// RPC helper functions
|
||||||
@ -409,11 +408,6 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
|||||||
return sock;
|
return sock;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
||||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
|
||||||
return ctx->name.c_str();
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||||
rpc_msg_free_buffer_req request = {ctx->remote_ptr};
|
rpc_msg_free_buffer_req request = {ctx->remote_ptr};
|
||||||
@ -524,7 +518,6 @@ static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
|
static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
|
||||||
/* .get_name = */ ggml_backend_rpc_buffer_get_name,
|
|
||||||
/* .free_buffer = */ ggml_backend_rpc_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_rpc_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_rpc_buffer_get_base,
|
/* .get_base = */ ggml_backend_rpc_buffer_get_base,
|
||||||
/* .init_tensor = */ ggml_backend_rpc_buffer_init_tensor,
|
/* .init_tensor = */ ggml_backend_rpc_buffer_init_tensor,
|
||||||
@ -551,7 +544,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
|
|||||||
if (response.remote_ptr != 0) {
|
if (response.remote_ptr != 0) {
|
||||||
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
||||||
ggml_backend_rpc_buffer_interface,
|
ggml_backend_rpc_buffer_interface,
|
||||||
new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
|
new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr},
|
||||||
response.remote_size);
|
response.remote_size);
|
||||||
return buffer;
|
return buffer;
|
||||||
} else {
|
} else {
|
||||||
@ -609,11 +602,6 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) {
|
|||||||
delete backend;
|
delete backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
|
|
||||||
ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
|
|
||||||
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
|
static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
|
||||||
UNUSED(backend);
|
UNUSED(backend);
|
||||||
// this is no-op because we don't have any async operations
|
// this is no-op because we don't have any async operations
|
||||||
@ -670,7 +658,6 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
|
|||||||
static ggml_backend_i ggml_backend_rpc_interface = {
|
static ggml_backend_i ggml_backend_rpc_interface = {
|
||||||
/* .get_name = */ ggml_backend_rpc_name,
|
/* .get_name = */ ggml_backend_rpc_name,
|
||||||
/* .free = */ ggml_backend_rpc_free,
|
/* .free = */ ggml_backend_rpc_free,
|
||||||
/* .get_default_buffer_type = */ ggml_backend_rpc_get_default_buffer_type,
|
|
||||||
/* .set_tensor_async = */ NULL,
|
/* .set_tensor_async = */ NULL,
|
||||||
/* .get_tensor_async = */ NULL,
|
/* .get_tensor_async = */ NULL,
|
||||||
/* .cpy_tensor_async = */ NULL,
|
/* .cpy_tensor_async = */ NULL,
|
||||||
@ -680,9 +667,6 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
|||||||
/* .graph_plan_update = */ NULL,
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_rpc_graph_compute,
|
/* .graph_compute = */ ggml_backend_rpc_graph_compute,
|
||||||
/* .supports_op = */ NULL,
|
|
||||||
/* .supports_buft = */ NULL,
|
|
||||||
/* .offload_op = */ NULL,
|
|
||||||
/* .event_record = */ NULL,
|
/* .event_record = */ NULL,
|
||||||
/* .event_wait = */ NULL,
|
/* .event_wait = */ NULL,
|
||||||
};
|
};
|
||||||
@ -1278,7 +1262,7 @@ static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t *
|
|||||||
|
|
||||||
static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
|
static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
|
||||||
// TODO: obtain value from the server
|
// TODO: obtain value from the server
|
||||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||||
|
|
||||||
UNUSED(dev);
|
UNUSED(dev);
|
||||||
}
|
}
|
||||||
|
@ -249,13 +249,10 @@ struct ggml_backend_sycl_buffer_context {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
|
static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_type_t buft);
|
||||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
|
||||||
return ctx->name.c_str();
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
|
static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
|
||||||
return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
|
return buffer->buft->iface.get_name == ggml_backend_sycl_buffer_type_get_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@ -440,7 +437,6 @@ catch (sycl::exception const &exc) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
|
static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
|
||||||
/* .get_name = */ ggml_backend_sycl_buffer_get_name,
|
|
||||||
/* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_sycl_buffer_get_base,
|
/* .get_base = */ ggml_backend_sycl_buffer_get_base,
|
||||||
/* .init_tensor = */ ggml_backend_sycl_buffer_init_tensor,
|
/* .init_tensor = */ ggml_backend_sycl_buffer_init_tensor,
|
||||||
@ -698,16 +694,6 @@ struct ggml_backend_sycl_split_buffer_context {
|
|||||||
std::vector<queue_ptr> streams;
|
std::vector<queue_ptr> streams;
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
||||||
return GGML_SYCL_NAME "_Split";
|
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
|
|
||||||
return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
||||||
delete ctx;
|
delete ctx;
|
||||||
@ -915,7 +901,6 @@ static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, u
|
|||||||
}
|
}
|
||||||
|
|
||||||
static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
|
static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
|
||||||
/* .get_name = */ ggml_backend_sycl_split_buffer_get_name,
|
|
||||||
/* .free_buffer = */ ggml_backend_sycl_split_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_sycl_split_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_sycl_split_buffer_get_base,
|
/* .get_base = */ ggml_backend_sycl_split_buffer_get_base,
|
||||||
/* .init_tensor = */ ggml_backend_sycl_split_buffer_init_tensor,
|
/* .init_tensor = */ ggml_backend_sycl_split_buffer_init_tensor,
|
||||||
@ -935,6 +920,10 @@ static const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_bu
|
|||||||
GGML_UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
|
||||||
|
return buffer->buft->iface.get_name == ggml_backend_sycl_split_buffer_type_get_name;
|
||||||
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
||||||
// instead, we allocate them for each tensor separately in init_tensor
|
// instead, we allocate them for each tensor separately in init_tensor
|
||||||
@ -1040,12 +1029,6 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
|
|||||||
GGML_UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
|
|
||||||
return GGML_SYCL_NAME "_Host";
|
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
ggml_sycl_host_free(buffer->context);
|
ggml_sycl_host_free(buffer->context);
|
||||||
}
|
}
|
||||||
@ -1061,7 +1044,6 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
|
|||||||
// FIXME: this is a hack to avoid having to implement a new buffer type
|
// FIXME: this is a hack to avoid having to implement a new buffer type
|
||||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||||
buffer->buft = buft;
|
buffer->buft = buft;
|
||||||
buffer->iface.get_name = ggml_backend_sycl_host_buffer_name;
|
|
||||||
buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer;
|
buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer;
|
||||||
|
|
||||||
return buffer;
|
return buffer;
|
||||||
@ -4889,12 +4871,6 @@ static void ggml_backend_sycl_free(ggml_backend_t backend) {
|
|||||||
delete backend;
|
delete backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
|
|
||||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
||||||
return ggml_backend_sycl_buffer_type(sycl_ctx->device);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
||||||
ggml_tensor *tensor,
|
ggml_tensor *tensor,
|
||||||
const void *data, size_t offset,
|
const void *data, size_t offset,
|
||||||
@ -5031,7 +5007,6 @@ static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_ev
|
|||||||
static ggml_backend_i ggml_backend_sycl_interface = {
|
static ggml_backend_i ggml_backend_sycl_interface = {
|
||||||
/* .get_name = */ ggml_backend_sycl_get_name,
|
/* .get_name = */ ggml_backend_sycl_get_name,
|
||||||
/* .free = */ ggml_backend_sycl_free,
|
/* .free = */ ggml_backend_sycl_free,
|
||||||
/* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
|
|
||||||
/* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
|
/* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
|
||||||
/* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
|
/* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
|
||||||
/* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
|
/* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
|
||||||
@ -5043,9 +5018,6 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
|||||||
/* .graph_plan_update = */ NULL,
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
|
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
|
||||||
/* .supports_op = */ NULL, // moved to device
|
|
||||||
/* .supports_buft = */ NULL, // moved to device
|
|
||||||
/* .offload_op = */ NULL, // moved to device
|
|
||||||
/* .event_record = */ ggml_backend_sycl_event_record,
|
/* .event_record = */ ggml_backend_sycl_event_record,
|
||||||
/* .event_wait = */ ggml_backend_sycl_event_wait,
|
/* .event_wait = */ ggml_backend_sycl_event_wait,
|
||||||
};
|
};
|
||||||
@ -5092,7 +5064,7 @@ static void ggml_backend_sycl_device_get_memory(ggml_backend_dev_t dev, size_t *
|
|||||||
|
|
||||||
static enum ggml_backend_dev_type ggml_backend_sycl_device_get_type(ggml_backend_dev_t dev) {
|
static enum ggml_backend_dev_type ggml_backend_sycl_device_get_type(ggml_backend_dev_t dev) {
|
||||||
GGML_UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_sycl_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
static void ggml_backend_sycl_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||||
@ -5388,12 +5360,14 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
|
|||||||
return ctx->devices[index];
|
return ctx->devices[index];
|
||||||
}
|
}
|
||||||
|
|
||||||
static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name)
|
static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
|
||||||
{
|
|
||||||
GGML_UNUSED(reg);
|
GGML_UNUSED(reg);
|
||||||
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
|
|
||||||
return (void *)ggml_backend_sycl_split_buffer_type;
|
// TODO: update to the current function signature
|
||||||
}
|
//if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
|
||||||
|
// return (void *)ggml_backend_sycl_split_buffer_type;
|
||||||
|
//}
|
||||||
|
|
||||||
// SYCL doesn't support registering host memory, left here for reference
|
// SYCL doesn't support registering host memory, left here for reference
|
||||||
// "ggml_backend_register_host_buffer"
|
// "ggml_backend_register_host_buffer"
|
||||||
// "ggml_backend_unregister_host_buffer"
|
// "ggml_backend_unregister_host_buffer"
|
||||||
|
@ -6247,13 +6247,8 @@ static void ggml_vk_get_device_description(int device, char * description, size_
|
|||||||
|
|
||||||
// device backend
|
// device backend
|
||||||
|
|
||||||
static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
||||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
|
||||||
return ctx->name.c_str();
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
|
static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
|
||||||
return buffer->iface.get_name == ggml_backend_vk_buffer_get_name;
|
return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
@ -6317,7 +6312,6 @@ static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t v
|
|||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
|
static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
|
||||||
/* .get_name = */ ggml_backend_vk_buffer_get_name,
|
|
||||||
/* .free_buffer = */ ggml_backend_vk_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_vk_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_vk_buffer_get_base,
|
/* .get_base = */ ggml_backend_vk_buffer_get_base,
|
||||||
/* .init_tensor = */ ggml_backend_vk_buffer_init_tensor,
|
/* .init_tensor = */ ggml_backend_vk_buffer_init_tensor,
|
||||||
@ -6413,7 +6407,6 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_
|
|||||||
|
|
||||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||||
buffer->buft = buft;
|
buffer->buft = buft;
|
||||||
buffer->iface.get_name = ggml_backend_vk_host_buffer_name;
|
|
||||||
buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer;
|
buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer;
|
||||||
|
|
||||||
return buffer;
|
return buffer;
|
||||||
@ -6646,7 +6639,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||||||
static ggml_backend_i ggml_backend_vk_interface = {
|
static ggml_backend_i ggml_backend_vk_interface = {
|
||||||
/* .get_name = */ ggml_backend_vk_name,
|
/* .get_name = */ ggml_backend_vk_name,
|
||||||
/* .free = */ ggml_backend_vk_free,
|
/* .free = */ ggml_backend_vk_free,
|
||||||
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
|
|
||||||
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
|
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
|
||||||
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
|
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
|
||||||
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
|
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
|
||||||
@ -6656,9 +6648,6 @@ static ggml_backend_i ggml_backend_vk_interface = {
|
|||||||
/* .graph_plan_update = */ NULL,
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
||||||
/* .supports_op = */ NULL,
|
|
||||||
/* .supports_buft = */ NULL,
|
|
||||||
/* .offload_op = */ NULL,
|
|
||||||
/* .event_record = */ NULL,
|
/* .event_record = */ NULL,
|
||||||
/* .event_wait = */ NULL,
|
/* .event_wait = */ NULL,
|
||||||
};
|
};
|
||||||
@ -6717,7 +6706,7 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total
|
|||||||
//////////////////////////
|
//////////////////////////
|
||||||
|
|
||||||
struct ggml_backend_vk_device_context {
|
struct ggml_backend_vk_device_context {
|
||||||
int device;
|
size_t device;
|
||||||
std::string name;
|
std::string name;
|
||||||
std::string description;
|
std::string description;
|
||||||
};
|
};
|
||||||
@ -6749,7 +6738,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(gg
|
|||||||
|
|
||||||
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
|
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
|
||||||
UNUSED(dev);
|
UNUSED(dev);
|
||||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||||
@ -6758,9 +6747,10 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
|||||||
props->type = ggml_backend_vk_device_get_type(dev);
|
props->type = ggml_backend_vk_device_get_type(dev);
|
||||||
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||||
props->caps = {
|
props->caps = {
|
||||||
/* async */ false,
|
/* .async = */ false,
|
||||||
/* host_buffer */ true,
|
/* .host_buffer = */ true,
|
||||||
/* events */ false,
|
/* .buffer_from_host_ptr = */ false,
|
||||||
|
/* .events = */ false,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -6949,7 +6939,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
|||||||
static std::mutex mutex;
|
static std::mutex mutex;
|
||||||
std::lock_guard<std::mutex> lock(mutex);
|
std::lock_guard<std::mutex> lock(mutex);
|
||||||
if (!initialized) {
|
if (!initialized) {
|
||||||
for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) {
|
for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
|
||||||
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
|
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
|
||||||
char desc[256];
|
char desc[256];
|
||||||
ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
|
ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
|
||||||
|
@ -4028,7 +4028,9 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
|
|||||||
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
|
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
|
||||||
GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
|
GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
|
||||||
__func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
|
__func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
|
||||||
assert(false);
|
#ifndef NDEBUG
|
||||||
|
GGML_ABORT("not enough space in the context's memory pool");
|
||||||
|
#endif
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -205,7 +205,7 @@ extern "C" {
|
|||||||
enum llama_split_mode {
|
enum llama_split_mode {
|
||||||
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
||||||
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
||||||
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
|
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
||||||
@ -274,10 +274,7 @@ extern "C" {
|
|||||||
int32_t n_gpu_layers; // number of layers to store in VRAM
|
int32_t n_gpu_layers; // number of layers to store in VRAM
|
||||||
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
||||||
|
|
||||||
// main_gpu interpretation depends on split_mode:
|
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
|
||||||
// LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
|
|
||||||
// LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
|
|
||||||
// LLAMA_SPLIT_MODE_LAYER: ignored
|
|
||||||
int32_t main_gpu;
|
int32_t main_gpu;
|
||||||
|
|
||||||
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
|
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
|
||||||
|
@ -20,7 +20,7 @@ logger = logging.getLogger("compare-llama-bench")
|
|||||||
# Properties by which to differentiate results per commit:
|
# Properties by which to differentiate results per commit:
|
||||||
KEY_PROPERTIES = [
|
KEY_PROPERTIES = [
|
||||||
"cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
|
"cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
|
||||||
"blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "embeddings", "n_threads",
|
"blas", "model_filename", "model_type", "n_batch", "n_ubatch", "embeddings", "n_threads",
|
||||||
"type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
|
"type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
3098
src/llama.cpp
3098
src/llama.cpp
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user