mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-26 03:12:23 +01:00
llama : quantize up to 31% faster on Linux and Windows with mmap (#3206)
* llama : enable mmap in quantize on Linux -> 31% faster
* also enable mmap on Windows

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
0a4a4a0982
commit
2777a84be4
21
llama.cpp
21
llama.cpp
@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||||||
nthread = std::thread::hardware_concurrency();
|
nthread = std::thread::hardware_concurrency();
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_model_loader ml(fname_inp, /*use_mmap*/ false);
|
// mmap consistently increases speed on Linux, and also increases speed on Windows with
|
||||||
|
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
|
||||||
|
#if defined(__linux__) || defined(_WIN32)
|
||||||
|
constexpr bool use_mmap = true;
|
||||||
|
#else
|
||||||
|
constexpr bool use_mmap = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
llama_model_loader ml(fname_inp, use_mmap);
|
||||||
|
if (ml.use_mmap) {
|
||||||
|
ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
|
||||||
|
}
|
||||||
|
|
||||||
llama_model model;
|
llama_model model;
|
||||||
llm_load_arch(ml, model);
|
llm_load_arch(ml, model);
|
||||||
@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||||||
|
|
||||||
const std::string name = ggml_get_name(tensor);
|
const std::string name = ggml_get_name(tensor);
|
||||||
|
|
||||||
if (read_data.size() < ggml_nbytes(tensor)) {
|
if (!ml.use_mmap) {
|
||||||
read_data.resize(ggml_nbytes(tensor));
|
if (read_data.size() < ggml_nbytes(tensor)) {
|
||||||
|
read_data.resize(ggml_nbytes(tensor));
|
||||||
|
}
|
||||||
|
tensor->data = read_data.data();
|
||||||
}
|
}
|
||||||
tensor->data = read_data.data();
|
|
||||||
ml.load_data_for(tensor);
|
ml.load_data_for(tensor);
|
||||||
|
|
||||||
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
|
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
|
||||||
|
Loading…
Reference in New Issue
Block a user