mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-29 19:09:32 +01:00
Add threads_batch parameter
This commit is contained in:
parent
41a2de96e5
commit
b6fe6acf88
@ -287,6 +287,7 @@ Optionally, you can use the following command-line flags:
|
|||||||
| Flag | Description |
|
| Flag | Description |
|
||||||
|-------------|-------------|
|
|-------------|-------------|
|
||||||
| `--threads` | Number of threads to use. |
|
| `--threads` | Number of threads to use. |
|
||||||
|
| `--threads-batch THREADS_BATCH` | Number of threads to use for batches/prompt processing. |
|
||||||
| `--n_batch` | Maximum number of prompt tokens to batch together when calling llama_eval. |
|
| `--n_batch` | Maximum number of prompt tokens to batch together when calling llama_eval. |
|
||||||
| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. |
|
| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. |
|
||||||
| `--n_ctx N_CTX` | Size of the prompt context. |
|
| `--n_ctx N_CTX` | Size of the prompt context. |
|
||||||
|
@ -194,6 +194,7 @@ class LlamacppHF(PreTrainedModel):
|
|||||||
'n_ctx': shared.args.n_ctx,
|
'n_ctx': shared.args.n_ctx,
|
||||||
'seed': int(shared.args.llama_cpp_seed),
|
'seed': int(shared.args.llama_cpp_seed),
|
||||||
'n_threads': shared.args.threads or None,
|
'n_threads': shared.args.threads or None,
|
||||||
|
'n_threads_batch': shared.args.threads_batch or None,
|
||||||
'n_batch': shared.args.n_batch,
|
'n_batch': shared.args.n_batch,
|
||||||
'use_mmap': not shared.args.no_mmap,
|
'use_mmap': not shared.args.no_mmap,
|
||||||
'use_mlock': shared.args.mlock,
|
'use_mlock': shared.args.mlock,
|
||||||
|
@ -76,6 +76,7 @@ class LlamaCppModel:
|
|||||||
'n_ctx': shared.args.n_ctx,
|
'n_ctx': shared.args.n_ctx,
|
||||||
'seed': int(shared.args.llama_cpp_seed),
|
'seed': int(shared.args.llama_cpp_seed),
|
||||||
'n_threads': shared.args.threads or None,
|
'n_threads': shared.args.threads or None,
|
||||||
|
'n_threads_batch': shared.args.threads_batch or None,
|
||||||
'n_batch': shared.args.n_batch,
|
'n_batch': shared.args.n_batch,
|
||||||
'use_mmap': not shared.args.no_mmap,
|
'use_mmap': not shared.args.no_mmap,
|
||||||
'use_mlock': shared.args.mlock,
|
'use_mlock': shared.args.mlock,
|
||||||
|
@ -91,6 +91,7 @@ loaders_and_params = OrderedDict({
|
|||||||
'tensor_split',
|
'tensor_split',
|
||||||
'n_batch',
|
'n_batch',
|
||||||
'threads',
|
'threads',
|
||||||
|
'threads_batch',
|
||||||
'no_mmap',
|
'no_mmap',
|
||||||
'mlock',
|
'mlock',
|
||||||
'mul_mat_q',
|
'mul_mat_q',
|
||||||
@ -107,6 +108,7 @@ loaders_and_params = OrderedDict({
|
|||||||
'tensor_split',
|
'tensor_split',
|
||||||
'n_batch',
|
'n_batch',
|
||||||
'threads',
|
'threads',
|
||||||
|
'threads_batch',
|
||||||
'no_mmap',
|
'no_mmap',
|
||||||
'mlock',
|
'mlock',
|
||||||
'mul_mat_q',
|
'mul_mat_q',
|
||||||
|
@ -115,6 +115,7 @@ parser.add_argument('--use_double_quant', action='store_true', help='use_double_
|
|||||||
|
|
||||||
# llama.cpp
|
# llama.cpp
|
||||||
parser.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
|
parser.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
|
||||||
|
parser.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
|
||||||
parser.add_argument('--n_batch', type=int, default=512, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
|
parser.add_argument('--n_batch', type=int, default=512, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
|
||||||
parser.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
|
parser.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
|
||||||
parser.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
|
parser.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
|
||||||
|
@ -69,6 +69,7 @@ def list_model_elements():
|
|||||||
'disable_exllama',
|
'disable_exllama',
|
||||||
'cfg_cache',
|
'cfg_cache',
|
||||||
'threads',
|
'threads',
|
||||||
|
'threads_batch',
|
||||||
'n_batch',
|
'n_batch',
|
||||||
'no_mmap',
|
'no_mmap',
|
||||||
'mlock',
|
'mlock',
|
||||||
|
@ -83,6 +83,7 @@ def create_ui():
|
|||||||
shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=128, value=shared.args.n_gpu_layers)
|
shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=128, value=shared.args.n_gpu_layers)
|
||||||
shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=32768, step=256, label="n_ctx", value=shared.args.n_ctx)
|
shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=32768, step=256, label="n_ctx", value=shared.args.n_ctx)
|
||||||
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads)
|
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads)
|
||||||
|
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=32, value=shared.args.threads_batch)
|
||||||
shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch)
|
shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch)
|
||||||
|
|
||||||
shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None")
|
shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None")
|
||||||
|
Loading…
Reference in New Issue
Block a user