diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index 4899ad99..86537a27 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -49,6 +49,7 @@ class LlamaCppModel:
             'n_batch': shared.args.n_batch,
             'use_mmap': not shared.args.no_mmap,
             'use_mlock': shared.args.mlock,
+            'low_vram': shared.args.low_vram,
             'n_gpu_layers': shared.args.n_gpu_layers
         }
diff --git a/modules/loaders.py b/modules/loaders.py
index 8ec575a7..e0db482c 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -34,6 +34,7 @@ loaders_and_params = {
         'n_batch',
         'threads',
         'no_mmap',
+        'low_vram',
         'mlock',
         'llama_cpp_seed',
     ],
diff --git a/modules/shared.py b/modules/shared.py
index 2b2fa061..4b6b9fe1 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -120,6 +120,7 @@ parser.add_argument('--use_double_quant', action='store_true', help='use_double_
 parser.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
 parser.add_argument('--n_batch', type=int, default=512, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
 parser.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
+parser.add_argument('--low-vram', action='store_true', help='Low VRAM mode.')
 parser.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
 parser.add_argument('--cache-capacity', type=str, help='Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')
 parser.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
diff --git a/modules/ui.py b/modules/ui.py
index 9fea2880..704be925 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -57,6 +57,7 @@ def list_model_elements():
         'threads',
         'n_batch',
         'no_mmap',
+        'low_vram',
         'mlock',
         'n_gpu_layers',
         'n_ctx',
diff --git a/server.py b/server.py
index edd91bc7..2723b284 100644
--- a/server.py
+++ b/server.py
@@ -248,6 +248,7 @@ def create_model_menus():
                             shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
                             shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
                             shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
+                            shared.gradio['low_vram'] = gr.Checkbox(label="low-vram", value=shared.args.low_vram)
                             shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
                             shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
                             shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.')
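
For context on how the flag is consumed: shared.args is the namespace produced by the argparse parser in modules/shared.py, so the new --low-vram switch surfaces as shared.args.low_vram (argparse converts dashes in option names to underscores), and LlamaCppModel forwards it verbatim as the low_vram keyword argument when constructing llama-cpp-python's Llama. Below is a minimal standalone sketch of that same flow, assuming a llama-cpp-python release whose Llama() constructor still accepts low_vram (it did around the time of this change; later releases may not); the model path and n_gpu_layers value are placeholders, not values from this patch.

# Minimal sketch: CLI flag -> argparse namespace -> Llama() keyword argument.
# Assumes llama-cpp-python with a `low_vram` parameter; path/layer count are
# hypothetical placeholders.
import argparse

from llama_cpp import Llama

parser = argparse.ArgumentParser()
parser.add_argument('--low-vram', action='store_true', help='Low VRAM mode.')
args = parser.parse_args()

params = {
    'model_path': 'models/ggml-model.bin',  # placeholder model path
    'n_gpu_layers': 20,                     # example value; offload some layers
    'low_vram': args.low_vram,              # False unless --low-vram was passed
}
model = Llama(**params)

The remaining hunks (modules/loaders.py, modules/ui.py, server.py) only register the setting with the llama.cpp loader's parameter list and expose it as a checkbox in the model menu, so no other loader is affected.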