mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-26 14:20:31 +01:00
cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)
* cuBLAS: fall back to pageable memory if pinned alloc fails * cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
This commit is contained in:
parent
90b19bd6ee
commit
b925f1f1b0
14
ggml-cuda.cu
14
ggml-cuda.cu
@ -355,8 +355,18 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
|
|||||||
}
|
}
|
||||||
|
|
||||||
void * ggml_cuda_host_malloc(size_t size) {
|
void * ggml_cuda_host_malloc(size_t size) {
|
||||||
void * ptr;
|
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
|
||||||
CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
void * ptr = nullptr;
|
||||||
|
cudaError_t err = cudaMallocHost((void **) &ptr, size);
|
||||||
|
if (err != cudaSuccess) {
|
||||||
|
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
|
||||||
|
size/1024.0/1024.0, cudaGetErrorString(err));
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
42
llama-util.h
42
llama-util.h
@ -395,6 +395,8 @@ struct llama_buffer {
|
|||||||
uint8_t * addr = NULL;
|
uint8_t * addr = NULL;
|
||||||
size_t size = 0;
|
size_t size = 0;
|
||||||
|
|
||||||
|
llama_buffer() = default;
|
||||||
|
|
||||||
void resize(size_t size) {
|
void resize(size_t size) {
|
||||||
delete[] addr;
|
delete[] addr;
|
||||||
addr = new uint8_t[size];
|
addr = new uint8_t[size];
|
||||||
@ -404,27 +406,59 @@ struct llama_buffer {
|
|||||||
~llama_buffer() {
|
~llama_buffer() {
|
||||||
delete[] addr;
|
delete[] addr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// disable copy and move
|
||||||
|
llama_buffer(const llama_buffer&) = delete;
|
||||||
|
llama_buffer(llama_buffer&&) = delete;
|
||||||
|
llama_buffer& operator=(const llama_buffer&) = delete;
|
||||||
|
llama_buffer& operator=(llama_buffer&&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUBLAS
|
||||||
#include "ggml-cuda.h"
|
#include "ggml-cuda.h"
|
||||||
struct llama_ctx_buffer {
|
struct llama_ctx_buffer {
|
||||||
uint8_t * addr = NULL;
|
uint8_t * addr = NULL;
|
||||||
|
bool is_cuda;
|
||||||
size_t size = 0;
|
size_t size = 0;
|
||||||
|
|
||||||
|
llama_ctx_buffer() = default;
|
||||||
|
|
||||||
void resize(size_t size) {
|
void resize(size_t size) {
|
||||||
if (addr) {
|
free();
|
||||||
ggml_cuda_host_free(addr);
|
|
||||||
}
|
|
||||||
addr = (uint8_t *) ggml_cuda_host_malloc(size);
|
addr = (uint8_t *) ggml_cuda_host_malloc(size);
|
||||||
|
if (addr) {
|
||||||
|
is_cuda = true;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// fall back to pageable memory
|
||||||
|
addr = new uint8_t[size];
|
||||||
|
is_cuda = false;
|
||||||
|
}
|
||||||
this->size = size;
|
this->size = size;
|
||||||
}
|
}
|
||||||
|
|
||||||
~llama_ctx_buffer() {
|
void free() {
|
||||||
if (addr) {
|
if (addr) {
|
||||||
|
if (is_cuda) {
|
||||||
ggml_cuda_host_free(addr);
|
ggml_cuda_host_free(addr);
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
delete[] addr;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
addr = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
~llama_ctx_buffer() {
|
||||||
|
free();
|
||||||
|
}
|
||||||
|
|
||||||
|
// disable copy and move
|
||||||
|
llama_ctx_buffer(const llama_ctx_buffer&) = delete;
|
||||||
|
llama_ctx_buffer(llama_ctx_buffer&&) = delete;
|
||||||
|
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
|
||||||
|
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
|
||||||
};
|
};
|
||||||
#else
|
#else
|
||||||
typedef llama_buffer llama_ctx_buffer;
|
typedef llama_buffer llama_ctx_buffer;
|
||||||
|
@ -727,8 +727,7 @@ struct llama_model_loader {
|
|||||||
LLAMA_ASSERT(offset == lt.size);
|
LLAMA_ASSERT(offset == lt.size);
|
||||||
} else if (lt.split_type == SPLIT_BY_COLUMNS) {
|
} else if (lt.split_type == SPLIT_BY_COLUMNS) {
|
||||||
// Let's load the data into temporary buffers to ensure the OS performs large loads.
|
// Let's load the data into temporary buffers to ensure the OS performs large loads.
|
||||||
std::vector<llama_buffer> tmp_bufs;
|
std::vector<llama_buffer> tmp_bufs(lt.shards.size());
|
||||||
tmp_bufs.resize(lt.shards.size());
|
|
||||||
for (size_t i = 0; i < lt.shards.size(); i++) {
|
for (size_t i = 0; i < lt.shards.size(); i++) {
|
||||||
llama_load_tensor_shard & shard = lt.shards.at(i);
|
llama_load_tensor_shard & shard = lt.shards.at(i);
|
||||||
llama_file & file = file_loaders.at(shard.file_idx)->file;
|
llama_file & file = file_loaders.at(shard.file_idx)->file;
|
||||||
|
Loading…
Reference in New Issue
Block a user