diff --git a/README.md b/README.md
index 73ae33bd..9201df13 100644
--- a/README.md
+++ b/README.md
@@ -205,7 +205,7 @@ Optionally, you can use the following command-line flags:
 
 | Flag | Description |
 |--------------------------------------------|-------------|
-| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv |
+| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, ctransformers |
 
 #### Accelerate/transformers
 
@@ -235,22 +235,33 @@ Optionally, you can use the following command-line flags:
 | `--quant_type QUANT_TYPE` | quant_type for 4-bit. Valid options: nf4, fp4. |
 | `--use_double_quant` | use_double_quant for 4-bit. |
 
-#### llama.cpp
+#### GGML (for llama.cpp and ctransformers)
 
 | Flag | Description |
 |-------------|-------------|
 | `--threads` | Number of threads to use. |
 | `--n_batch` | Maximum number of prompt tokens to batch together when calling llama_eval. |
+| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. |
+| `--n_ctx N_CTX` | Size of the prompt context. |
+
+#### llama.cpp
+
+| Flag | Description |
+|-------------|-------------|
 | `--no-mmap` | Prevent mmap from being used. |
 | `--mlock` | Force the system to keep the model in RAM. |
 | `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. |
-| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. |
-| `--n_ctx N_CTX` | Size of the prompt context. |
 | `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). |
 | `--n_gqa N_GQA` | grouped-query attention. Must be 8 for llama-2 70b. |
 | `--rms_norm_eps RMS_NORM_EPS` | 5e-6 is a good value for llama-2 models. |
 | `--cpu` | Use the CPU version of llama-cpp-python instead of the GPU-accelerated version. |
 
+#### ctransformers
+
+| Flag | Description |
+|-------------|-------------|
+| `--model_type MODEL_TYPE` | Model type of pre-quantized model. Currently gpt2, gptj, gpt_neox, falcon, llama, mpt, gpt_bigcode, dolly-v2, and replit are supported. |
+
 #### AutoGPTQ
 
 | Flag | Description |
diff --git a/models/config.yaml b/models/config.yaml
index 3d5f48ff..ba12e8bc 100644
--- a/models/config.yaml
+++ b/models/config.yaml
@@ -10,6 +10,18 @@
   model_type: 'llama'
 .*bloom:
   model_type: 'bloom'
+.*gpt2:
+  model_type: 'gpt2'
+.*falcon:
+  model_type: 'falcon'
+.*mpt:
+  model_type: 'mpt'
+.*(starcoder|starchat):
+  model_type: 'gpt_bigcode'
+.*dolly-v2:
+  model_type: 'dolly-v2'
+.*replit:
+  model_type: 'replit'
 llama-65b-gptq-3bit:
   groupsize: 'None'
 .*(4bit|int4):
@@ -281,3 +293,5 @@ llama-65b-gptq-3bit:
 .*openchat:
   mode: 'instruct'
   instruction_template: 'OpenChat'
+.*falcon.*-instruct:
+  mode: 'instruct'
diff --git a/modules/ctransformers_model.py b/modules/ctransformers_model.py
index 74c4018a..5e0f347c 100644
--- a/modules/ctransformers_model.py
+++ b/modules/ctransformers_model.py
@@ -18,6 +18,7 @@ class CtransformersModel:
             threads=shared.args.threads,
             gpu_layers=shared.args.n_gpu_layers,
             batch_size=shared.args.n_batch,
+            context_length=shared.args.n_ctx,
             stream=True
         )
 
@@ -31,7 +32,7 @@ class CtransformersModel:
         return result, result
 
     def model_type_is_auto(self):
-        return shared.args.model_type == "Auto" or shared.args.model_type == "None"
+        return shared.args.model_type is None or shared.args.model_type == "Auto" or shared.args.model_type == "None"
 
     def model_dir(self, path):
         if path.is_file():
@@ -48,7 +49,7 @@ class CtransformersModel:
     def generate(self, prompt, state, callback=None):
         prompt = prompt if type(prompt) is str else prompt.decode()
         # ctransformers uses -1 for random seed
-        generator = self.model._stream(
+        generator = self.model(
             prompt=prompt,
             max_new_tokens=state['max_new_tokens'],
             temperature=state['temperature'],
diff --git a/modules/loaders.py b/modules/loaders.py
index 2b3a50b3..f7288f90 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -92,6 +92,7 @@ loaders_and_params = OrderedDict({
         'llamacpp_HF_info',
     ],
     'ctransformers': [
+        'n_ctx',
         'n_gpu_layers',
         'n_batch',
         'threads',
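For context, here is a minimal sketch of the ctransformers calls that the loader changes above wire together. It is not code from this repository: the model path, model type, and numeric values are placeholders, and only the parameter names visible in the diff (`threads`, `gpu_layers`, `batch_size`, `context_length`, `stream`, and the keyword arguments passed in `generate()`) are relied on.

```python
# Minimal sketch, assuming the ctransformers package is installed and a
# pre-quantized GGML file exists at the placeholder path below.
from ctransformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "models/falcon-7b-instruct.ggmlv3.q4_0.bin",  # placeholder model path
    model_type="falcon",      # --model_type (or resolved via the config.yaml regexes)
    threads=8,                # --threads
    gpu_layers=32,            # --n-gpu-layers
    batch_size=512,           # --n_batch
    context_length=2048,      # --n_ctx, the value newly passed through in this diff
    stream=True,              # yield tokens incrementally
)

# With stream=True in the config, calling the model object returns a generator
# of text chunks, which is what the updated generate() method iterates over.
for chunk in model(prompt="Write one sentence about GGML.",
                   max_new_tokens=64, temperature=0.7):
    print(chunk, end="", flush=True)
```

Calling the model object directly, as the `generate()` hunk now does, keeps the loader on the public ctransformers interface instead of the private `_stream` method while producing the same incremental output.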
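The `models/config.yaml` additions rely on the top-level keys being treated as case-insensitive regular expressions matched against the model name, as existing entries such as `.*(4bit|int4)` already imply. The snippet below is a hypothetical illustration of that lookup, not code from `modules/`; the `CONFIG` dict mirrors the new entries and `infer_model_type()` is invented for the example.

```python
import re

# Hypothetical illustration of how regex keys like the ones added to
# models/config.yaml can resolve a model name to a ctransformers model_type.
CONFIG = {
    r".*gpt2": "gpt2",
    r".*falcon": "falcon",
    r".*mpt": "mpt",
    r".*(starcoder|starchat)": "gpt_bigcode",
    r".*dolly-v2": "dolly-v2",
    r".*replit": "replit",
}

def infer_model_type(model_name: str) -> str:
    for pattern, model_type in CONFIG.items():
        if re.match(pattern.lower(), model_name.lower()):
            return model_type
    return "None"  # no match: fall back to autodetection (see model_type_is_auto)

print(infer_model_type("starchat-beta-GGML"))  # -> gpt_bigcode
print(infer_model_type("replit-code-v1-3b"))   # -> replit
```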