From 39799adc4739c769e057ce253d31dbd08b0695c6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 21 Jan 2025 11:49:44 -0800
Subject: [PATCH] Add a helpful error message when llama.cpp fails to load the model

---
 modules/llamacpp_hf.py    | 14 +++++++++++++-
 modules/llamacpp_model.py | 14 +++++++++++++-
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
index f9964fe8..b3761e0f 100644
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -202,7 +202,19 @@ class LlamacppHF(PreTrainedModel):
             params["type_v"] = get_llamacpp_cache_type_for_string(shared.args.cache_type)
 
         Llama = llama_cpp_lib().Llama
-        model = Llama(**params)
+        try:
+            model = Llama(**params)
+        except Exception as e:
+            error_message = (
+                f"Failed loading the model. **This usually happens due to lack of memory**. Try these steps:\n"
+                f"1. Reduce the context length `n_ctx` (currently {shared.args.n_ctx})."
+                f"{' Try a lower value like 4096.' if shared.args.n_ctx > 4096 else ''}"
+                "\n"
+                f"2. Lower the `n-gpu-layers` value (currently {shared.args.n_gpu_layers})."
+            )
+
+            raise type(e)(error_message) from e
+
         model.last_updated_index = -1
 
         return LlamacppHF(model, model_file)
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index c79755e4..db25c66c 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -108,7 +108,19 @@ class LlamaCppModel:
             params["type_k"] = get_llamacpp_cache_type_for_string(shared.args.cache_type)
             params["type_v"] = get_llamacpp_cache_type_for_string(shared.args.cache_type)
 
-        result.model = Llama(**params)
+        try:
+            result.model = Llama(**params)
+        except Exception as e:
+            error_message = (
+                f"Failed loading the model. **This usually happens due to lack of memory**. Try these steps:\n"
+                f"1. Reduce the context length `n_ctx` (currently {shared.args.n_ctx})."
+                f"{' Try a lower value like 4096.' if shared.args.n_ctx > 4096 else ''}"
+                "\n"
+                f"2. Lower the `n-gpu-layers` value (currently {shared.args.n_gpu_layers})."
+            )
+
+            raise type(e)(error_message) from e
+
         if cache_capacity > 0:
             result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
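
Outside the patch itself, the re-raise pattern both hunks rely on can be exercised in isolation. The sketch below is a minimal, standalone illustration (not part of the patch and under stated assumptions); `load_model`, `load_with_hint`, and their parameters are hypothetical placeholders, not the project's actual loader API.

    # Minimal sketch of the pattern used in both hunks above: catch the loader
    # failure and re-raise the same exception type with a more actionable
    # message, chaining the original exception via `from e`.
    # NOTE: `load_model` and its keyword arguments are hypothetical placeholders.

    def load_with_hint(load_model, n_ctx, n_gpu_layers):
        try:
            return load_model(n_ctx=n_ctx, n_gpu_layers=n_gpu_layers)
        except Exception as e:
            hint = (
                "Failed loading the model. This usually happens due to lack of memory. "
                f"Try reducing n_ctx (currently {n_ctx}) "
                f"or lowering n-gpu-layers (currently {n_gpu_layers})."
            )
            # Re-raising with type(e) keeps the original exception class, so
            # callers that catch a specific error type still work; `from e`
            # preserves the original exception as the cause for debugging.
            raise type(e)(hint) from e

    if __name__ == "__main__":
        def failing_loader(**kwargs):
            raise MemoryError("insufficient memory for KV cache")

        try:
            load_with_hint(failing_loader, n_ctx=8192, n_gpu_layers=33)
        except MemoryError as err:
            print(err)  # prints the hint; the original error is err.__cause__

One caveat of this approach, shared with the patch: constructing `type(e)(message)` assumes the exception class accepts a single string argument, which holds for most built-in exceptions but not necessarily for every custom exception type.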