Mirror of https://github.com/oobabooga/text-generation-webui.git (synced 2025-01-27 04:23:21 +01:00)

Add a helpful error message when llama.cpp fails to load the model

commit 39799adc47 (parent 079ace63ec)
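The change wraps the `Llama(**params)` constructor call in both loaders with the same pattern: catch the failure, build a human-readable message, and re-raise the same exception class with the original chained as its cause. A minimal sketch of the pattern (hypothetical `load_or_explain` helper, not code from the repo):

```python
# Minimal sketch of the re-raise pattern used in this commit (hypothetical
# helper, not repo code). type(e)(message) keeps the original exception
# class, so existing `except SomeError:` handlers still match; `from e`
# chains the original traceback for debugging.

def load_or_explain(loader, **params):
    try:
        return loader(**params)
    except Exception as e:
        message = (
            "Failed loading the model. This usually happens due to lack of memory. "
            "Try reducing the context length or the number of GPU layers."
        )
        raise type(e)(message) from e
```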
modules/llamacpp_hf.py

```diff
@@ -202,7 +202,19 @@ class LlamacppHF(PreTrainedModel):
             params["type_v"] = get_llamacpp_cache_type_for_string(shared.args.cache_type)
 
         Llama = llama_cpp_lib().Llama
-        model = Llama(**params)
+        try:
+            model = Llama(**params)
+        except Exception as e:
+            error_message = (
+                f"Failed loading the model. **This usually happens due to lack of memory**. Try these steps:\n"
+                f"1. Reduce the context length `n_ctx` (currently {shared.args.n_ctx})."
+                f"{' Try a lower value like 4096.' if shared.args.n_ctx > 4096 else '.'}"
+                "\n"
+                f"2. Lower the `n-gpu-layers` value (currently {shared.args.n_gpu_layers})."
+            )
+
+            raise type(e)(error_message) from e
+
         model.last_updated_index = -1
 
         return LlamacppHF(model, model_file)
```
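The numbered list in the message is built from chained f-strings with a conditional segment. A quick standalone check (assumed `n_ctx` values, not repo code) shows the resulting first item; note that the `else '.'` branch yields a doubled period, since the preceding segment already ends with one:

```python
# Standalone check of the conditional message segment (assumed n_ctx values).
for n_ctx in (8192, 2048):
    line = (
        f"1. Reduce the context length `n_ctx` (currently {n_ctx})."
        f"{' Try a lower value like 4096.' if n_ctx > 4096 else '.'}"
    )
    print(line)

# Output:
# 1. Reduce the context length `n_ctx` (currently 8192). Try a lower value like 4096.
# 1. Reduce the context length `n_ctx` (currently 2048)..
```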
modules/llamacpp_model.py

```diff
@@ -108,7 +108,19 @@ class LlamaCppModel:
             params["type_k"] = get_llamacpp_cache_type_for_string(shared.args.cache_type)
             params["type_v"] = get_llamacpp_cache_type_for_string(shared.args.cache_type)
 
-        result.model = Llama(**params)
+        try:
+            result.model = Llama(**params)
+        except Exception as e:
+            error_message = (
+                f"Failed loading the model. **This usually happens due to lack of memory**. Try these steps:\n"
+                f"1. Reduce the context length `n_ctx` (currently {shared.args.n_ctx})."
+                f"{' Try a lower value like 4096.' if shared.args.n_ctx > 4096 else '.'}"
+                "\n"
+                f"2. Lower the `n-gpu-layers` value (currently {shared.args.n_gpu_layers})."
+            )
+
+            raise type(e)(error_message) from e
+
         if cache_capacity > 0:
             result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
```
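One subtlety of `raise type(e)(error_message) from e`: it assumes the caught class can be constructed from a single string, which is not true of every exception type. A defensive variant (hypothetical `reraise_with_message` helper, not part of this commit) could fall back to `RuntimeError`:

```python
from typing import NoReturn

def reraise_with_message(e: Exception, message: str) -> NoReturn:
    # Hypothetical defensive variant, not part of this commit.
    # Some exception classes reject a single str argument, e.g.
    # UnicodeDecodeError requires five positional arguments.
    try:
        new_exc = type(e)(message)
    except TypeError:
        new_exc = RuntimeError(message)
    raise new_exc from e
```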