llama.cpp: add 4-bit/8-bit kv cache options

2025-01-12 21:37:35 +01:00 · 2024-06-29 09:10:33 -07:00 · 2024-06-29 09:10:33 -07:00 · 4ea260098f
commit 4ea260098f
parent 220c1797fc
3 changed files with 18 additions and 0 deletions
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -221,6 +221,13 @@ class LlamacppHF(PreTrainedModel):
            'flash_attn': shared.args.flash_attn
        }

+        if shared.args.cache_4bit:
+            params["type_k"] = 2
+            params["type_v"] = 2
+        elif shared.args.cache_8bit:
+            params["type_k"] = 8
+            params["type_v"] = 8
+
        Llama = llama_cpp_lib().Llama
        model = Llama(**params)

--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -100,6 +100,13 @@ class LlamaCppModel:
            'flash_attn': shared.args.flash_attn
        }

+        if shared.args.cache_4bit:
+            params["type_k"] = 2
+            params["type_v"] = 2
+        elif shared.args.cache_8bit:
+            params["type_k"] = 8
+            params["type_v"] = 8
+
        result.model = Llama(**params)
        if cache_capacity > 0:
            result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -30,6 +30,8 @@ loaders_and_params = OrderedDict({
    'llama.cpp': [
        'n_ctx',
        'n_gpu_layers',
+        'cache_8bit',
+        'cache_4bit',
        'tensor_split',
        'n_batch',
        'threads',
@@ -51,6 +53,8 @@ loaders_and_params = OrderedDict({
    'llamacpp_HF': [
        'n_ctx',
        'n_gpu_layers',
+        'cache_8bit',
+        'cache_4bit',
        'tensor_split',
        'n_batch',
        'threads',