mirror of https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-22 16:17:57 +01:00

Add no_flash_attn option

parent aaf726dbfb
commit 77abd9b69b
@@ -336,6 +336,7 @@ Optionally, you can use the following command-line flags:
 |`--gpu-split` | Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7. |
 |`--max_seq_len MAX_SEQ_LEN` | Maximum sequence length. |
 |`--cfg-cache` | ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama. |
+|`--no_flash_attn` | Force flash-attention to not be used. |
 
 #### AutoGPTQ
 
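
In practice the new flag is passed at launch, e.g. `python server.py --no_flash_attn` with one of the ExLlamaV2 loaders selected (assuming the usual server.py entry point); the loader then tells ExLlamaV2 to fall back to its non-flash attention implementation.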
@@ -46,6 +46,7 @@ class Exllamav2Model:
         config.max_seq_len = shared.args.max_seq_len
         config.scale_pos_emb = shared.args.compress_pos_emb
         config.scale_alpha_value = shared.args.alpha_value
+        config.no_flash_attn = shared.args.no_flash_attn
 
         model = ExLlamaV2(config)
 
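
For context, a minimal sketch of how this setting reaches the exllamav2 library itself, outside the webui (illustrative only: the model path and sequence length are placeholders, and the ExLlamaV2Config/ExLlamaV2 calls follow the same pattern the loader above uses):

    from exllamav2 import ExLlamaV2, ExLlamaV2Config

    config = ExLlamaV2Config()
    config.model_dir = "/path/to/model"  # placeholder model directory
    config.prepare()                     # read the model's own config from model_dir
    config.max_seq_len = 4096            # placeholder; mirrors --max_seq_len
    config.no_flash_attn = True          # the setting that --no_flash_attn toggles
    model = ExLlamaV2(config)            # same construction as in the loader above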
@@ -152,5 +152,6 @@ class Exllamav2HF(PreTrainedModel):
         config.max_seq_len = shared.args.max_seq_len
         config.scale_pos_emb = shared.args.compress_pos_emb
         config.scale_alpha_value = shared.args.alpha_value
+        config.no_flash_attn = shared.args.no_flash_attn
 
         return Exllamav2HF(config)
@@ -117,6 +117,7 @@ parser.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (
 parser.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
 parser.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequence length.')
 parser.add_argument('--cfg-cache', action='store_true', help='ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama.')
+parser.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
 
 # AutoGPTQ
 parser.add_argument('--triton', action='store_true', help='Use triton.')
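
A standalone sketch of the flag's behavior (a toy parser with only this option, not the full argument list above):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')

    print(parser.parse_args([]).no_flash_attn)                   # False: flash-attention stays enabled by default
    print(parser.parse_args(['--no_flash_attn']).no_flash_attn)  # True: the ExLlamaV2 loaders then set config.no_flash_attn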