Mirror of https://github.com/oobabooga/text-generation-webui.git
Various ctransformers fixes (#3556)

Co-authored-by: cal066 <cal066@users.noreply.github.com>

Parent commit: b8df4a436e
This commit: 66c04c304d

README.md (19 changed lines), plus the additional files shown in the hunks below.
README.md

```diff
@@ -205,7 +205,7 @@ Optionally, you can use the following command-line flags:
 
 | Flag | Description |
 |--------------------------------------------|-------------|
-| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv |
+| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, ctransformers |
 
 #### Accelerate/transformers
 
```
```diff
@@ -235,22 +235,33 @@ Optionally, you can use the following command-line flags:
 | `--quant_type QUANT_TYPE` | quant_type for 4-bit. Valid options: nf4, fp4. |
 | `--use_double_quant` | use_double_quant for 4-bit. |
 
-#### llama.cpp
+#### GGML (for llama.cpp and ctransformers)
 
 | Flag | Description |
 |-------------|-------------|
 | `--threads` | Number of threads to use. |
 | `--n_batch` | Maximum number of prompt tokens to batch together when calling llama_eval. |
+| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. |
+| `--n_ctx N_CTX` | Size of the prompt context. |
+
+#### llama.cpp
+
+| Flag | Description |
+|-------------|-------------|
 | `--no-mmap` | Prevent mmap from being used. |
 | `--mlock` | Force the system to keep the model in RAM. |
 | `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. |
-| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. |
-| `--n_ctx N_CTX` | Size of the prompt context. |
 | `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). |
 | `--n_gqa N_GQA` | grouped-query attention. Must be 8 for llama-2 70b. |
 | `--rms_norm_eps RMS_NORM_EPS` | 5e-6 is a good value for llama-2 models. |
 | `--cpu` | Use the CPU version of llama-cpp-python instead of the GPU-accelerated version. |
 
+#### ctransformers
+
+| Flag | Description |
+|-------------|-------------|
+| `--model_type MODEL_TYPE` | Model type of pre-quantized model. Currently gpt2, gptj, gpt_neox, falcon, llama, mpt, gpt_bigcode, dolly-v2, and replit are supported. |
+
 #### AutoGPTQ
 
 | Flag | Description |
```
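The tables above only document the flags themselves. As a rough illustration of how such options typically surface inside the program, here is a minimal argparse sketch; the option names come from the README diff above, but the defaults and the parsing code are assumptions, not the repository's actual `shared.py`.

```python
# Minimal sketch, not the webui's real argument parser: it only shows how the
# GGML/ctransformers flags documented above could be declared and read.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--loader', type=str, default=None,
                    help='Model loader, e.g. llamacpp or ctransformers.')
parser.add_argument('--model_type', type=str, default=None,
                    help='Model type of a pre-quantized model (falcon, mpt, ...).')
parser.add_argument('--threads', type=int, default=0)        # default is an assumption
parser.add_argument('--n_batch', type=int, default=512)      # default is an assumption
parser.add_argument('--n-gpu-layers', type=int, default=0)   # stored as args.n_gpu_layers
parser.add_argument('--n_ctx', type=int, default=2048)       # default is an assumption

args = parser.parse_args(['--loader', 'ctransformers', '--model_type', 'falcon'])
print(args.loader, args.model_type, args.n_gpu_layers, args.n_ctx)
```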
```diff
@@ -10,6 +10,18 @@
   model_type: 'llama'
 .*bloom:
   model_type: 'bloom'
+.*gpt2:
+  model_type: 'gpt2'
+.*falcon:
+  model_type: 'falcon'
+.*mpt:
+  model_type: 'mpt'
+.*(starcoder|starchat):
+  model_type: 'gpt_bigcode'
+.*dolly-v2:
+  model_type: 'dolly-v2'
+.*replit:
+  model_type: 'replit'
 llama-65b-gptq-3bit:
   groupsize: 'None'
 .*(4bit|int4):
@@ -281,3 +293,5 @@ llama-65b-gptq-3bit:
 .*openchat:
   mode: 'instruct'
   instruction_template: 'OpenChat'
+.*falcon.*-instruct:
+  mode: 'instruct'
```
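The YAML hunks above add regex-keyed entries (`.*falcon`, `.*mpt`, ...) that pin a `model_type` or mode for matching model names. A minimal sketch of how such a mapping can be resolved follows; the file name and `settings_for` helper are hypothetical, and the webui's real lookup logic may differ in details.

```python
# Hypothetical resolver for regex-keyed model settings like the entries added above.
# 'model_settings.yaml' and settings_for() are illustrative names, not the repo's.
import re
import yaml

with open('model_settings.yaml') as f:
    config = yaml.safe_load(f) or {}

def settings_for(model_name: str) -> dict:
    """Merge every entry whose regex key matches the model name (case-insensitive)."""
    settings = {}
    for pattern, values in config.items():
        if re.match(pattern.lower(), model_name.lower()):
            settings.update(values)
    return settings

# e.g. a folder named 'falcon-7b-instruct' would pick up both
# {'model_type': 'falcon'} and {'mode': 'instruct'} from entries like the ones above.
print(settings_for('falcon-7b-instruct'))
```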
```diff
@@ -18,6 +18,7 @@ class CtransformersModel:
             threads=shared.args.threads,
             gpu_layers=shared.args.n_gpu_layers,
             batch_size=shared.args.n_batch,
+            context_length=shared.args.n_ctx,
             stream=True
         )
 
```
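The hunk above forwards `context_length=shared.args.n_ctx` into the keyword arguments the loader hands to ctransformers. The enclosing call is not shown in this diff, so the sketch below is an assumed shape of that wiring rather than the repository's exact code; only the kwargs visible in the hunk are certain.

```python
# Sketch (assumed wiring): load a GGML model with ctransformers, forwarding the
# same kwargs that appear in the diff above. Parameter defaults are placeholders.
from ctransformers import AutoConfig, AutoModelForCausalLM

def load_ggml_model(path: str, model_type: str, threads: int = 4,
                    n_gpu_layers: int = 0, n_batch: int = 512, n_ctx: int = 2048):
    config = AutoConfig.from_pretrained(
        path,
        threads=threads,
        gpu_layers=n_gpu_layers,
        batch_size=n_batch,
        context_length=n_ctx,   # the kwarg this commit starts passing through
        stream=True,
    )
    return AutoModelForCausalLM.from_pretrained(path, model_type=model_type, config=config)

# model = load_ggml_model('models/falcon-7b.ggmlv3.q4_0.bin', model_type='falcon')
```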
```diff
@@ -31,7 +32,7 @@ class CtransformersModel:
         return result, result
 
     def model_type_is_auto(self):
-        return shared.args.model_type == "Auto" or shared.args.model_type == "None"
+        return shared.args.model_type is None or shared.args.model_type == "Auto" or shared.args.model_type == "None"
 
     def model_dir(self, path):
         if path.is_file():
```
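The extra `is None` check matters because an unset command-line option usually arrives as Python's `None`, not the string "None". A toy reproduction of the intended behaviour, under that assumption:

```python
# Toy version of the check above (assumes --model_type defaults to None when
# the flag is omitted); all three "unset" spellings should count as auto.
def model_type_is_auto(model_type):
    return model_type is None or model_type == "Auto" or model_type == "None"

assert model_type_is_auto(None)
assert model_type_is_auto("Auto")
assert model_type_is_auto("None")
assert not model_type_is_auto("falcon")
```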
```diff
@@ -48,7 +49,7 @@ class CtransformersModel:
     def generate(self, prompt, state, callback=None):
         prompt = prompt if type(prompt) is str else prompt.decode()
         # ctransformers uses -1 for random seed
-        generator = self.model._stream(
+        generator = self.model(
             prompt=prompt,
             max_new_tokens=state['max_new_tokens'],
             temperature=state['temperature'],
```
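Switching from the private `self.model._stream(...)` helper to calling the model object directly relies on ctransformers yielding decoded text pieces when streaming is enabled. A minimal, self-contained sketch of that consumption pattern follows; the model path, model type, and the `on_text` callback are placeholders, not values from this diff.

```python
# Minimal sketch of consuming a streaming ctransformers generation.
from ctransformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    'models/falcon-7b.ggmlv3.q4_0.bin',  # placeholder path
    model_type='falcon',
    context_length=2048,
)

def on_text(piece):
    print(piece, end='', flush=True)

output = ''
# Calling the model with stream=True yields decoded text pieces one at a time.
for piece in model('Once upon a time', max_new_tokens=64, temperature=0.7,
                   seed=-1,            # ctransformers uses -1 for a random seed
                   stream=True):
    output += piece
    on_text(piece)
```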
```diff
@@ -92,6 +92,7 @@ loaders_and_params = OrderedDict({
         'llamacpp_HF_info',
     ],
     'ctransformers': [
+        'n_ctx',
         'n_gpu_layers',
         'n_batch',
         'threads',
```
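Adding `'n_ctx'` to the ctransformers entry is what exposes the context-length setting for this loader. A sketch of how such a loader-to-parameter map can be consumed is below; the `visible_params` helper is illustrative, and the real dictionary contains more loaders and keys than shown here.

```python
# Illustrative consumer of a loaders_and_params-style mapping; only the
# ctransformers entry and the keys visible in the diff are reproduced.
from collections import OrderedDict

loaders_and_params = OrderedDict({
    'ctransformers': [
        'n_ctx',
        'n_gpu_layers',
        'n_batch',
        'threads',
        # ...remaining keys omitted
    ],
})

def visible_params(loader: str) -> list:
    """Return the parameter names to expose for a given loader."""
    return loaders_and_params.get(loader, [])

print(visible_params('ctransformers'))  # ['n_ctx', 'n_gpu_layers', 'n_batch', 'threads']
```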