Reduce model loading time (#43)
* Use buffering
* Use vector
* Minor

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
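In short: each std::ifstream used for model loading is now backed by a 1 MiB user-provided buffer (a std::vector<char> handed to pubsetbuf()), so the loader issues fewer, larger reads.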
parent 2a20f48efa
commit 63fd76fbb0
main.cpp (4 additions, 0 deletions)
@@ -87,7 +87,10 @@ struct llama_model {
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
+    std::vector<char> f_buf(1024*1024);
+
     auto fin = std::ifstream(fname, std::ios::binary);
+    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
         return false;
@@ -325,6 +328,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
 
         fin = std::ifstream(fname_part, std::ios::binary);
+        fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
         fin.seekg(file_offset);
 
         // load weights
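For reference, a minimal self-contained sketch of the buffering technique this commit applies: a user-supplied 1 MiB std::vector<char> installed as the stream's buffer via pubsetbuf(). The file name and the one-pass read below are illustrative placeholders, not part of the commit.

#include <cstdio>
#include <fstream>
#include <iterator>
#include <string>
#include <vector>

int main() {
    const std::string fname = "model.bin"; // placeholder path, not from the commit

    // Backing storage for the stream buffer; it must outlive the stream.
    std::vector<char> f_buf(1024*1024);

    auto fin = std::ifstream(fname, std::ios::binary);
    // Install the buffer immediately after opening and before any reads;
    // calling setbuf once I/O has started is implementation-defined.
    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
    if (!fin) {
        std::fprintf(stderr, "failed to open '%s'\n", fname.c_str());
        return 1;
    }

    // Read the whole file in one pass; with the large buffer installed,
    // the underlying filebuf performs fewer, larger reads.
    std::vector<char> data((std::istreambuf_iterator<char>(fin)),
                            std::istreambuf_iterator<char>());
    std::printf("read %zu bytes\n", data.size());
    return 0;
}

The same idea appears twice in the diff because llama_model_load() reopens the stream once per model part, so each reopened std::ifstream gets the buffer installed again.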