diff --git a/README.md b/README.md index f7cc95e0..85115a28 100644 --- a/README.md +++ b/README.md @@ -287,6 +287,7 @@ Optionally, you can use the following command-line flags: | Flag | Description | |-------------|-------------| | `--threads` | Number of threads to use. | +| `--threads-batch THREADS_BATCH` | Number of threads to use for batches/prompt processing. | | `--n_batch` | Maximum number of prompt tokens to batch together when calling llama_eval. | | `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. | | `--n_ctx N_CTX` | Size of the prompt context. | diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py index 366fdf85..c5abdef3 100644 --- a/modules/llamacpp_hf.py +++ b/modules/llamacpp_hf.py @@ -194,6 +194,7 @@ class LlamacppHF(PreTrainedModel): 'n_ctx': shared.args.n_ctx, 'seed': int(shared.args.llama_cpp_seed), 'n_threads': shared.args.threads or None, + 'n_threads_batch': shared.args.threads_batch or None, 'n_batch': shared.args.n_batch, 'use_mmap': not shared.args.no_mmap, 'use_mlock': shared.args.mlock, diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 92c679e6..554da2b5 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -76,6 +76,7 @@ class LlamaCppModel: 'n_ctx': shared.args.n_ctx, 'seed': int(shared.args.llama_cpp_seed), 'n_threads': shared.args.threads or None, + 'n_threads_batch': shared.args.threads_batch or None, 'n_batch': shared.args.n_batch, 'use_mmap': not shared.args.no_mmap, 'use_mlock': shared.args.mlock, diff --git a/modules/loaders.py b/modules/loaders.py index 1b6bef06..7580e30e 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -91,6 +91,7 @@ loaders_and_params = OrderedDict({ 'tensor_split', 'n_batch', 'threads', + 'threads_batch', 'no_mmap', 'mlock', 'mul_mat_q', @@ -107,6 +108,7 @@ loaders_and_params = OrderedDict({ 'tensor_split', 'n_batch', 'threads', + 'threads_batch', 'no_mmap', 'mlock', 'mul_mat_q', diff --git a/modules/shared.py b/modules/shared.py index 01e82ee8..6e4965bb 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -115,6 +115,7 @@ parser.add_argument('--use_double_quant', action='store_true', help='use_double_ # llama.cpp parser.add_argument('--threads', type=int, default=0, help='Number of threads to use.') +parser.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.') parser.add_argument('--n_batch', type=int, default=512, help='Maximum number of prompt tokens to batch together when calling llama_eval.') parser.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') parser.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') diff --git a/modules/ui.py b/modules/ui.py index 6271b0d4..77e56e92 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -69,6 +69,7 @@ def list_model_elements(): 'disable_exllama', 'cfg_cache', 'threads', + 'threads_batch', 'n_batch', 'no_mmap', 'mlock', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index a562d1ed..49c5a611 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -83,6 +83,7 @@ def create_ui(): shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=128, value=shared.args.n_gpu_layers) shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=32768, step=256, label="n_ctx", value=shared.args.n_ctx) shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads) + shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=32, value=shared.args.threads_batch) shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch) shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None")