llama : Add ability to cancel model load

Changed llama_progress_callback to return bool instead of void. If the
callback returns false, model loading is aborted and
llama_load_model_from_file returns nullptr.
This commit is contained in:
crasm 2023-12-14 04:03:25 -05:00
parent 55e87c3749
commit 9abe2e44d1
2 changed files with 36 additions and 15 deletions

View File

@ -2297,7 +2297,8 @@ struct llama_model_loader {
} }
} }
void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { // Returns false if cancelled by progress_callback
bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
size_t size_data = 0; size_t size_data = 0;
size_t size_lock = 0; size_t size_lock = 0;
size_t size_pref = 0; // prefetch size_t size_pref = 0; // prefetch
@ -2323,7 +2324,9 @@ struct llama_model_loader {
GGML_ASSERT(cur); // unused tensors should have been caught by load_data already GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
if (progress_callback) { if (progress_callback) {
progress_callback((float) done_size / size_data, progress_callback_user_data); if (!progress_callback((float) done_size / size_data, progress_callback_user_data)) {
return false;
}
} }
// allocate temp buffer if not using mmap // allocate temp buffer if not using mmap
@ -2371,6 +2374,7 @@ struct llama_model_loader {
done_size += ggml_nbytes(cur); done_size += ggml_nbytes(cur);
} }
return true;
} }
}; };
@ -2937,7 +2941,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
} }
static void llm_load_tensors( // Returns false if cancelled by progress_callback
static bool llm_load_tensors(
llama_model_loader & ml, llama_model_loader & ml,
llama_model & model, llama_model & model,
int n_gpu_layers, int n_gpu_layers,
@ -2948,6 +2953,8 @@ static void llm_load_tensors(
void * progress_callback_user_data) { void * progress_callback_user_data) {
model.t_start_us = ggml_time_us(); model.t_start_us = ggml_time_us();
bool ok = true; // if false, model load was cancelled
auto & ctx = model.ctx; auto & ctx = model.ctx;
auto & hparams = model.hparams; auto & hparams = model.hparams;
@ -3678,10 +3685,11 @@ static void llm_load_tensors(
} }
#endif #endif
ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); ok = ok && ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
if (progress_callback) { if (progress_callback) {
progress_callback(1.0f, progress_callback_user_data); // Even though the model is done loading, we still honor
// cancellation since we need to free allocations.
ok = ok && progress_callback(1.0f, progress_callback_user_data);
} }
model.mapping = std::move(ml.mapping); model.mapping = std::move(ml.mapping);
@ -3689,9 +3697,11 @@ static void llm_load_tensors(
// loading time will be recalculated after the first eval, so // loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration // we take page faults deferred by mmap() into consideration
model.t_load_us = ggml_time_us() - model.t_start_us; model.t_load_us = ggml_time_us() - model.t_start_us;
return ok;
} }
static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { // Returns -1 on error, -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
try { try {
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
@ -3712,16 +3722,18 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
return true; return true;
} }
llm_load_tensors( if (!llm_load_tensors(
ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
params.progress_callback, params.progress_callback_user_data params.progress_callback, params.progress_callback_user_data
); )) {
return -2;
}
} catch (const std::exception & err) { } catch (const std::exception & err) {
LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
return false; return -1;
} }
return true; return 0;
} }
// //
@ -9017,11 +9029,18 @@ struct llama_model * llama_load_model_from_file(
LLAMA_LOG_INFO("\n"); LLAMA_LOG_INFO("\n");
} }
} }
return true;
}; };
} }
if (!llama_model_load(path_model, *model, params)) { int status = llama_model_load(path_model, *model, params);
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
} else if (status == -2) {
LLAMA_LOG_INFO("%s, cancelled model load\n", __func__);
}
delete model; delete model;
return nullptr; return nullptr;
} }

View File

@ -126,7 +126,7 @@ extern "C" {
bool sorted; bool sorted;
} llama_token_data_array; } llama_token_data_array;
typedef void (*llama_progress_callback)(float progress, void *ctx); typedef bool (*llama_progress_callback)(float progress, void *ctx);
// Input data for llama_decode // Input data for llama_decode
// A llama_batch object can contain input about one or many sequences // A llama_batch object can contain input about one or many sequences
@ -179,7 +179,9 @@ extern "C" {
int32_t main_gpu; // the GPU that is used for scratch and small tensors int32_t main_gpu; // the GPU that is used for scratch and small tensors
const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
// called with a progress value between 0 and 1, pass NULL to disable // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
// If the provided progress_callback returns true, model loading continues.
// If it returns false, model loading is immediately aborted.
llama_progress_callback progress_callback; llama_progress_callback progress_callback;
// context pointer passed to the progress callback // context pointer passed to the progress callback