mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-26 01:30:20 +01:00
Add rope_freq_base parameter for CodeLlama
This commit is contained in:
parent
feecd8190f
commit
52ab2a6b9e
@ -337,8 +337,9 @@ Optionally, you can use the following command-line flags:
|
|||||||
|
|
||||||
| Flag | Description |
|
| Flag | Description |
|
||||||
|------------------|-------------|
|
|------------------|-------------|
|
||||||
|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. |
|
| `--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. |
|
||||||
|`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. |
|
| `--rope_freq_base ROPE_FREQ_BASE` | If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63). |
|
||||||
|
| `--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale. |
|
||||||
|
|
||||||
#### Gradio
|
#### Gradio
|
||||||
|
|
||||||
|
@ -3,7 +3,7 @@ from pathlib import Path
|
|||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from torch import version as torch_version
|
from torch import version as torch_version
|
||||||
|
|
||||||
from modules import shared
|
from modules import RoPE, shared
|
||||||
from modules.logging_colors import logger
|
from modules.logging_colors import logger
|
||||||
from modules.models import clear_torch_cache
|
from modules.models import clear_torch_cache
|
||||||
from modules.text_generation import get_max_prompt_length
|
from modules.text_generation import get_max_prompt_length
|
||||||
@ -56,8 +56,8 @@ class ExllamaModel:
|
|||||||
config.set_auto_map(shared.args.gpu_split)
|
config.set_auto_map(shared.args.gpu_split)
|
||||||
config.gpu_peer_fix = True
|
config.gpu_peer_fix = True
|
||||||
|
|
||||||
if shared.args.alpha_value:
|
if shared.args.alpha_value > 1 or shared.args.rope_freq_base > 0:
|
||||||
config.alpha_value = shared.args.alpha_value
|
config.alpha_value = RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base)
|
||||||
config.calculate_rotary_embedding_base()
|
config.calculate_rotary_embedding_base()
|
||||||
|
|
||||||
if torch_version.hip:
|
if torch_version.hip:
|
||||||
|
@ -7,7 +7,7 @@ from torch.nn import CrossEntropyLoss
|
|||||||
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
|
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
|
||||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||||
|
|
||||||
from modules import shared
|
from modules import RoPE, shared
|
||||||
from modules.logging_colors import logger
|
from modules.logging_colors import logger
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -134,8 +134,8 @@ class ExllamaHF(PreTrainedModel):
|
|||||||
config.set_auto_map(shared.args.gpu_split)
|
config.set_auto_map(shared.args.gpu_split)
|
||||||
config.gpu_peer_fix = True
|
config.gpu_peer_fix = True
|
||||||
|
|
||||||
if shared.args.alpha_value:
|
if shared.args.alpha_value > 1 or shared.args.rope_freq_base > 0:
|
||||||
config.alpha_value = shared.args.alpha_value
|
config.alpha_value = RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base)
|
||||||
config.calculate_rotary_embedding_base()
|
config.calculate_rotary_embedding_base()
|
||||||
|
|
||||||
if torch.version.hip:
|
if torch.version.hip:
|
||||||
|
@ -7,7 +7,7 @@ from torch.nn import CrossEntropyLoss
|
|||||||
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
|
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
|
||||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||||
|
|
||||||
from modules import shared
|
from modules import RoPE, shared
|
||||||
from modules.logging_colors import logger
|
from modules.logging_colors import logger
|
||||||
|
|
||||||
import llama_cpp
|
import llama_cpp
|
||||||
@ -185,7 +185,7 @@ class LlamacppHF(PreTrainedModel):
|
|||||||
'mul_mat_q': shared.args.mul_mat_q,
|
'mul_mat_q': shared.args.mul_mat_q,
|
||||||
'low_vram': shared.args.low_vram,
|
'low_vram': shared.args.low_vram,
|
||||||
'n_gpu_layers': shared.args.n_gpu_layers,
|
'n_gpu_layers': shared.args.n_gpu_layers,
|
||||||
'rope_freq_base': 10000 * shared.args.alpha_value ** (64 / 63.),
|
'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
|
||||||
'tensor_split': tensor_split_list,
|
'tensor_split': tensor_split_list,
|
||||||
'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
|
'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
|
||||||
'n_gqa': shared.args.n_gqa or None,
|
'n_gqa': shared.args.n_gqa or None,
|
||||||
|
@ -3,7 +3,7 @@ from functools import partial
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from modules import shared
|
from modules import RoPE, shared
|
||||||
from modules.callbacks import Iteratorize
|
from modules.callbacks import Iteratorize
|
||||||
from modules.logging_colors import logger
|
from modules.logging_colors import logger
|
||||||
from modules.text_generation import get_max_prompt_length
|
from modules.text_generation import get_max_prompt_length
|
||||||
@ -72,7 +72,7 @@ class LlamaCppModel:
|
|||||||
'mul_mat_q': shared.args.mul_mat_q,
|
'mul_mat_q': shared.args.mul_mat_q,
|
||||||
'low_vram': shared.args.low_vram,
|
'low_vram': shared.args.low_vram,
|
||||||
'n_gpu_layers': shared.args.n_gpu_layers,
|
'n_gpu_layers': shared.args.n_gpu_layers,
|
||||||
'rope_freq_base': 10000 * shared.args.alpha_value ** (64 / 63.),
|
'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
|
||||||
'tensor_split': tensor_split_list,
|
'tensor_split': tensor_split_list,
|
||||||
'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
|
'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
|
||||||
'n_gqa': shared.args.n_gqa or None,
|
'n_gqa': shared.args.n_gqa or None,
|
||||||
|
@ -21,6 +21,7 @@ loaders_and_params = OrderedDict({
|
|||||||
'compute_dtype',
|
'compute_dtype',
|
||||||
'trust_remote_code',
|
'trust_remote_code',
|
||||||
'alpha_value',
|
'alpha_value',
|
||||||
|
'rope_freq_base',
|
||||||
'compress_pos_emb',
|
'compress_pos_emb',
|
||||||
'transformers_info'
|
'transformers_info'
|
||||||
],
|
],
|
||||||
@ -28,6 +29,7 @@ loaders_and_params = OrderedDict({
|
|||||||
'gpu_split',
|
'gpu_split',
|
||||||
'max_seq_len',
|
'max_seq_len',
|
||||||
'alpha_value',
|
'alpha_value',
|
||||||
|
'rope_freq_base',
|
||||||
'compress_pos_emb',
|
'compress_pos_emb',
|
||||||
'cfg_cache',
|
'cfg_cache',
|
||||||
'exllama_HF_info',
|
'exllama_HF_info',
|
||||||
@ -36,6 +38,7 @@ loaders_and_params = OrderedDict({
|
|||||||
'gpu_split',
|
'gpu_split',
|
||||||
'max_seq_len',
|
'max_seq_len',
|
||||||
'alpha_value',
|
'alpha_value',
|
||||||
|
'rope_freq_base',
|
||||||
'compress_pos_emb',
|
'compress_pos_emb',
|
||||||
'exllama_info',
|
'exllama_info',
|
||||||
],
|
],
|
||||||
@ -77,6 +80,7 @@ loaders_and_params = OrderedDict({
|
|||||||
'mul_mat_q',
|
'mul_mat_q',
|
||||||
'llama_cpp_seed',
|
'llama_cpp_seed',
|
||||||
'alpha_value',
|
'alpha_value',
|
||||||
|
'rope_freq_base',
|
||||||
'compress_pos_emb',
|
'compress_pos_emb',
|
||||||
'cpu',
|
'cpu',
|
||||||
],
|
],
|
||||||
@ -93,6 +97,7 @@ loaders_and_params = OrderedDict({
|
|||||||
'mlock',
|
'mlock',
|
||||||
'mul_mat_q',
|
'mul_mat_q',
|
||||||
'alpha_value',
|
'alpha_value',
|
||||||
|
'rope_freq_base',
|
||||||
'compress_pos_emb',
|
'compress_pos_emb',
|
||||||
'cpu',
|
'cpu',
|
||||||
'cfg_cache',
|
'cfg_cache',
|
||||||
|
@ -18,7 +18,7 @@ from transformers import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
import modules.shared as shared
|
import modules.shared as shared
|
||||||
from modules import llama_attn_hijack, sampler_hijack
|
from modules import llama_attn_hijack, RoPE, sampler_hijack
|
||||||
from modules.logging_colors import logger
|
from modules.logging_colors import logger
|
||||||
from modules.models_settings import infer_loader
|
from modules.models_settings import infer_loader
|
||||||
|
|
||||||
@ -219,7 +219,7 @@ def huggingface_loader(model_name):
|
|||||||
if shared.args.compress_pos_emb > 1:
|
if shared.args.compress_pos_emb > 1:
|
||||||
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
|
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
|
||||||
elif shared.args.alpha_value > 1:
|
elif shared.args.alpha_value > 1:
|
||||||
params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value}
|
params['rope_scaling'] = {'type': 'dynamic', 'factor': RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base)}
|
||||||
|
|
||||||
model = LoaderClass.from_pretrained(checkpoint, **params)
|
model = LoaderClass.from_pretrained(checkpoint, **params)
|
||||||
|
|
||||||
|
@ -159,8 +159,9 @@ parser.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The s
|
|||||||
parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.')
|
parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.')
|
||||||
|
|
||||||
# RoPE
|
# RoPE
|
||||||
parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.")
|
|
||||||
parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.")
|
parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.")
|
||||||
|
# The default must be 0 ("not set"): the loaders treat any value > 0 as an
# explicit override of alpha_value, so a non-zero default would silently
# disable alpha_value for every user who never passes this flag.
parser.add_argument('--rope_freq_base', type=int, default=0, help="If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)")
|
||||||
|
parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.")
|
||||||
|
|
||||||
# Gradio
|
# Gradio
|
||||||
parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.')
|
parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.')
|
||||||
|
@ -79,7 +79,8 @@ def list_model_elements():
|
|||||||
'gpu_split',
|
'gpu_split',
|
||||||
'max_seq_len',
|
'max_seq_len',
|
||||||
'compress_pos_emb',
|
'compress_pos_emb',
|
||||||
'alpha_value'
|
'alpha_value',
|
||||||
|
'rope_freq_base'
|
||||||
]
|
]
|
||||||
|
|
||||||
for i in range(torch.cuda.device_count()):
|
for i in range(torch.cuda.device_count()):
|
||||||
|
@ -91,7 +91,8 @@ def create_ui():
|
|||||||
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
|
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
|
||||||
shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len)
|
shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len)
|
||||||
shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.1, info='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
|
shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.1, info='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
|
||||||
shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length).', value=shared.args.compress_pos_emb)
|
shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=100000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base)
|
||||||
|
shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb)
|
||||||
|
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
|
shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
|
||||||
|
Loading…
Reference in New Issue
Block a user