mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 13:27:21 +01:00
cuda : allocate all temporary ggml_tensor_extra_gpu from a fixed-size buffer (#2220)
This commit is contained in:
parent
e8035f141e
commit
7cdd30bf1f
24
ggml-cuda.cu
24
ggml-cuda.cu
@ -3646,6 +3646,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
|||||||
delete extra;
|
delete extra;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
|
||||||
|
static size_t g_temp_tensor_extra_index = 0;
|
||||||
|
|
||||||
|
static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
||||||
|
if (g_temp_tensor_extras == nullptr) {
|
||||||
|
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t alloc_index = g_temp_tensor_extra_index;
|
||||||
|
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
|
||||||
|
struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
|
||||||
|
memset(extra, 0, sizeof(*extra));
|
||||||
|
|
||||||
|
return extra;
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
|
void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
|
||||||
if (scratch && g_scratch_size == 0) {
|
if (scratch && g_scratch_size == 0) {
|
||||||
return;
|
return;
|
||||||
@ -3663,8 +3679,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
|||||||
}
|
}
|
||||||
|
|
||||||
tensor->backend = GGML_BACKEND_GPU;
|
tensor->backend = GGML_BACKEND_GPU;
|
||||||
struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
|
struct ggml_tensor_extra_gpu * extra;
|
||||||
memset(extra, 0, sizeof(*extra));
|
|
||||||
|
|
||||||
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
||||||
tensor->op == GGML_OP_VIEW ||
|
tensor->op == GGML_OP_VIEW ||
|
||||||
@ -3679,10 +3694,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
|||||||
if (tensor->op == GGML_OP_VIEW) {
|
if (tensor->op == GGML_OP_VIEW) {
|
||||||
memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
|
memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
|
||||||
}
|
}
|
||||||
|
extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||||
extra->data_device[g_main_device] = src0_ddc + offset;
|
extra->data_device[g_main_device] = src0_ddc + offset;
|
||||||
} else if (tensor->op == GGML_OP_CPY) {
|
} else if (tensor->op == GGML_OP_CPY) {
|
||||||
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
|
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
|
||||||
void * src1_ddv = src1_extra->data_device[g_main_device];
|
void * src1_ddv = src1_extra->data_device[g_main_device];
|
||||||
|
extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||||
extra->data_device[g_main_device] = src1_ddv;
|
extra->data_device[g_main_device] = src1_ddv;
|
||||||
} else if (scratch) {
|
} else if (scratch) {
|
||||||
GGML_ASSERT(size <= g_scratch_size);
|
GGML_ASSERT(size <= g_scratch_size);
|
||||||
@ -3695,6 +3712,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
|||||||
CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
|
CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
|
||||||
g_scratch_buffer = data;
|
g_scratch_buffer = data;
|
||||||
}
|
}
|
||||||
|
extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||||
extra->data_device[g_main_device] = data + g_scratch_offset;
|
extra->data_device[g_main_device] = data + g_scratch_offset;
|
||||||
|
|
||||||
g_scratch_offset += size;
|
g_scratch_offset += size;
|
||||||
@ -3704,6 +3722,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
|||||||
void * data;
|
void * data;
|
||||||
CUDA_CHECK(cudaMalloc(&data, size));
|
CUDA_CHECK(cudaMalloc(&data, size));
|
||||||
CUDA_CHECK(cudaMemset(data, 0, size));
|
CUDA_CHECK(cudaMemset(data, 0, size));
|
||||||
|
extra = new ggml_tensor_extra_gpu;
|
||||||
|
memset(extra, 0, sizeof(*extra));
|
||||||
extra->data_device[g_main_device] = data;
|
extra->data_device[g_main_device] = data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user