From 66c04c304deb89ecb8286e3dbcfda5d0c31b6a32 Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Sun, 13 Aug 2023 22:09:03 -0400
Subject: [PATCH] Various ctransformers fixes (#3556)

---------

Co-authored-by: cal066
---
 README.md                      | 19 +++++++++++++++----
 models/config.yaml             | 14 ++++++++++++++
 modules/ctransformers_model.py |  5 +++--
 modules/loaders.py             |  1 +
 4 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 73ae33bd..9201df13 100644
--- a/README.md
+++ b/README.md
@@ -205,7 +205,7 @@ Optionally, you can use the following command-line flags:
 
 | Flag | Description |
 |--------------------------------------------|-------------|
-| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv |
+| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, ctransformers |
 
 #### Accelerate/transformers
 
@@ -235,22 +235,33 @@ Optionally, you can use the following command-line flags:
 | `--quant_type QUANT_TYPE` | quant_type for 4-bit. Valid options: nf4, fp4. |
 | `--use_double_quant` | use_double_quant for 4-bit. |
 
-#### llama.cpp
+#### GGML (for llama.cpp and ctransformers)
 
 | Flag | Description |
 |-------------|-------------|
 | `--threads` | Number of threads to use. |
 | `--n_batch` | Maximum number of prompt tokens to batch together when calling llama_eval. |
+| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. |
+| `--n_ctx N_CTX` | Size of the prompt context. |
+
+#### llama.cpp
+
+| Flag | Description |
+|-------------|-------------|
 | `--no-mmap` | Prevent mmap from being used. |
 | `--mlock` | Force the system to keep the model in RAM. |
 | `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. |
-| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. |
-| `--n_ctx N_CTX` | Size of the prompt context. |
 | `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). |
 | `--n_gqa N_GQA` | grouped-query attention. Must be 8 for llama-2 70b. |
 | `--rms_norm_eps RMS_NORM_EPS` | 5e-6 is a good value for llama-2 models. |
 | `--cpu` | Use the CPU version of llama-cpp-python instead of the GPU-accelerated version. |
 
+#### ctransformers
+
+| Flag | Description |
+|-------------|-------------|
+| `--model_type MODEL_TYPE` | Model type of pre-quantized model. Currently gpt2, gptj, gpt_neox, falcon, llama, mpt, gpt_bigcode, dolly-v2, and replit are supported. |
+
 #### AutoGPTQ
 
 | Flag | Description |
diff --git a/models/config.yaml b/models/config.yaml
index 3d5f48ff..ba12e8bc 100644
--- a/models/config.yaml
+++ b/models/config.yaml
@@ -10,6 +10,18 @@
   model_type: 'llama'
 .*bloom:
   model_type: 'bloom'
+.*gpt2:
+  model_type: 'gpt2'
+.*falcon:
+  model_type: 'falcon'
+.*mpt:
+  model_type: 'mpt'
+.*(starcoder|starchat):
+  model_type: 'gpt_bigcode'
+.*dolly-v2:
+  model_type: 'dolly-v2'
+.*replit:
+  model_type: 'replit'
 llama-65b-gptq-3bit:
   groupsize: 'None'
 .*(4bit|int4):
@@ -281,3 +293,5 @@ llama-65b-gptq-3bit:
 .*openchat:
   mode: 'instruct'
   instruction_template: 'OpenChat'
+.*falcon.*-instruct:
+  mode: 'instruct'
diff --git a/modules/ctransformers_model.py b/modules/ctransformers_model.py
index 74c4018a..5e0f347c 100644
--- a/modules/ctransformers_model.py
+++ b/modules/ctransformers_model.py
@@ -18,6 +18,7 @@ class CtransformersModel:
             threads=shared.args.threads,
             gpu_layers=shared.args.n_gpu_layers,
             batch_size=shared.args.n_batch,
+            context_length=shared.args.n_ctx,
             stream=True
         )
 
@@ -31,7 +32,7 @@ class CtransformersModel:
         return result, result
 
     def model_type_is_auto(self):
-        return shared.args.model_type == "Auto" or shared.args.model_type == "None"
+        return shared.args.model_type is None or shared.args.model_type == "Auto" or shared.args.model_type == "None"
 
     def model_dir(self, path):
         if path.is_file():
@@ -48,7 +49,7 @@ class CtransformersModel:
     def generate(self, prompt, state, callback=None):
         prompt = prompt if type(prompt) is str else prompt.decode()
         # ctransformers uses -1 for random seed
-        generator = self.model._stream(
+        generator = self.model(
            prompt=prompt,
             max_new_tokens=state['max_new_tokens'],
             temperature=state['temperature'],
diff --git a/modules/loaders.py b/modules/loaders.py
index 2b3a50b3..f7288f90 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -92,6 +92,7 @@ loaders_and_params = OrderedDict({
         'llamacpp_HF_info',
     ],
     'ctransformers': [
+        'n_ctx',
         'n_gpu_layers',
         'n_batch',
         'threads',
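For reviewers, here is a minimal sketch (not part of the patch) of how the GGML flags documented in the README hunk map onto a direct ctransformers call, which is roughly what `CtransformersModel` does after this change. The model path and parameter values are placeholders; it assumes a local GGML file and a 0.2.x release of ctransformers.

```python
# Sketch only: placeholder path and values, assuming ctransformers >= 0.2.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "models/llama-7b.ggmlv3.q4_0.bin",  # placeholder local GGML file
    model_type="llama",      # --model_type (or matched via the models/config.yaml regexes)
    threads=8,               # --threads
    batch_size=512,          # --n_batch
    gpu_layers=0,            # --n-gpu-layers
    context_length=2048,     # --n_ctx, the value this patch now passes through
)

# Calling the model object directly (instead of the private _stream helper)
# returns a generator of text pieces when stream=True, matching the patched
# generate() method.
for piece in llm("Tell me a joke:", max_new_tokens=64, temperature=0.7, stream=True):
    print(piece, end="", flush=True)
```

The `context_length=` argument is the standalone counterpart of the new `context_length=shared.args.n_ctx` pass-through; before this patch the `--n_ctx` setting never reached the ctransformers loader.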