mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-29 02:49:30 +01:00
Add RoPE scaling support for transformers (including dynamic NTK)
https://github.com/huggingface/transformers/pull/24653
This commit is contained in:
parent
f4caaf337a
commit
d8fb506aff
@ -299,12 +299,12 @@ Optionally, you can use the following command-line flags:
|
|||||||
| `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". |
|
| `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". |
|
||||||
| `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. |
|
| `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. |
|
||||||
|
|
||||||
#### RoPE (for llama.cpp and ExLlama only)
|
#### RoPE (for llama.cpp, ExLlama, and transformers)
|
||||||
|
|
||||||
| Flag | Description |
|
| Flag | Description |
|
||||||
|------------------|-------------|
|
|------------------|-------------|
|
||||||
|`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. |
|
|`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. |
|
||||||
|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both. |
|
|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. |
|
||||||
|
|
||||||
#### Gradio
|
#### Gradio
|
||||||
|
|
||||||
|
@ -39,8 +39,8 @@ loaders_and_params = {
|
|||||||
'low_vram',
|
'low_vram',
|
||||||
'mlock',
|
'mlock',
|
||||||
'llama_cpp_seed',
|
'llama_cpp_seed',
|
||||||
'compress_pos_emb',
|
|
||||||
'alpha_value',
|
'alpha_value',
|
||||||
|
'compress_pos_emb',
|
||||||
'cpu',
|
'cpu',
|
||||||
],
|
],
|
||||||
'llamacpp_HF': [
|
'llamacpp_HF': [
|
||||||
@ -54,8 +54,8 @@ loaders_and_params = {
|
|||||||
'low_vram',
|
'low_vram',
|
||||||
'mlock',
|
'mlock',
|
||||||
'llama_cpp_seed',
|
'llama_cpp_seed',
|
||||||
'compress_pos_emb',
|
|
||||||
'alpha_value',
|
'alpha_value',
|
||||||
|
'compress_pos_emb',
|
||||||
'cpu',
|
'cpu',
|
||||||
'llamacpp_HF_info',
|
'llamacpp_HF_info',
|
||||||
],
|
],
|
||||||
@ -73,20 +73,22 @@ loaders_and_params = {
|
|||||||
'quant_type',
|
'quant_type',
|
||||||
'compute_dtype',
|
'compute_dtype',
|
||||||
'trust_remote_code',
|
'trust_remote_code',
|
||||||
|
'alpha_value',
|
||||||
|
'compress_pos_emb',
|
||||||
'transformers_info'
|
'transformers_info'
|
||||||
],
|
],
|
||||||
'ExLlama': [
|
'ExLlama': [
|
||||||
'gpu_split',
|
'gpu_split',
|
||||||
'max_seq_len',
|
'max_seq_len',
|
||||||
'compress_pos_emb',
|
|
||||||
'alpha_value',
|
'alpha_value',
|
||||||
|
'compress_pos_emb',
|
||||||
'exllama_info',
|
'exllama_info',
|
||||||
],
|
],
|
||||||
'ExLlama_HF': [
|
'ExLlama_HF': [
|
||||||
'gpu_split',
|
'gpu_split',
|
||||||
'max_seq_len',
|
'max_seq_len',
|
||||||
'compress_pos_emb',
|
|
||||||
'alpha_value',
|
'alpha_value',
|
||||||
|
'compress_pos_emb',
|
||||||
'exllama_HF_info',
|
'exllama_HF_info',
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
@ -144,7 +144,7 @@ def huggingface_loader(model_name):
|
|||||||
LoaderClass = AutoModelForCausalLM
|
LoaderClass = AutoModelForCausalLM
|
||||||
|
|
||||||
# Load the model in simple 16-bit mode by default
|
# Load the model in simple 16-bit mode by default
|
||||||
if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None]):
|
if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1]):
|
||||||
model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16, trust_remote_code=shared.args.trust_remote_code)
|
model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16, trust_remote_code=shared.args.trust_remote_code)
|
||||||
if torch.backends.mps.is_available():
|
if torch.backends.mps.is_available():
|
||||||
device = torch.device('mps')
|
device = torch.device('mps')
|
||||||
@ -215,6 +215,11 @@ def huggingface_loader(model_name):
|
|||||||
no_split_module_classes=model._no_split_modules
|
no_split_module_classes=model._no_split_modules
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if shared.args.compress_pos_emb > 1:
|
||||||
|
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
|
||||||
|
elif shared.args.alpha_value > 1:
|
||||||
|
params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value}
|
||||||
|
|
||||||
model = LoaderClass.from_pretrained(checkpoint, **params)
|
model = LoaderClass.from_pretrained(checkpoint, **params)
|
||||||
|
|
||||||
return model
|
return model
|
||||||
|
@ -164,7 +164,7 @@ parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile t
|
|||||||
|
|
||||||
# RoPE
|
# RoPE
|
||||||
parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.")
|
parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.")
|
||||||
parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both.")
|
parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.")
|
||||||
|
|
||||||
# Gradio
|
# Gradio
|
||||||
parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.')
|
parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.')
|
||||||
|
@ -89,8 +89,8 @@ def create_ui():
|
|||||||
shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.')
|
shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.')
|
||||||
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
|
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
|
||||||
shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len)
|
shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len)
|
||||||
|
shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.1, info='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
|
||||||
shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.', value=shared.args.compress_pos_emb)
|
shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.', value=shared.args.compress_pos_emb)
|
||||||
shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.1, info='Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
|
|
||||||
|
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
|
shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
|
||||||
|
Loading…
Reference in New Issue
Block a user