diff --git a/README.md b/README.md index 76774c0b..50d07cd6 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,7 @@ Optionally, you can use the following command-line flags: | `--cpu` | Use the CPU to generate text.| | `--load-in-8bit` | Load the model with 8-bit precision.| | `--load-in-4bit` | Load the model with 4-bit precision. Currently only works with LLaMA.| -| `--llama-bits` | Load pre-quantized models with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA. | +| `--llama-bits` | Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA. | | `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. | | `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.| | `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. | diff --git a/modules/shared.py b/modules/shared.py index f3f46329..3ea4ef41 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -68,7 +68,7 @@ parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI i parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.') parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.') parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision. Currently only works with LLaMA.') -parser.add_argument('--llama-bits', type=int, default=0, help='Load pre-quantized models with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA.') +parser.add_argument('--llama-bits', type=int, default=0, help='Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA.') parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.') parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.') parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')