Add rope_freq_base parameter for CodeLlama

oobabooga 2023-08-25 06:53:37 -07:00
parent feecd8190f
commit 52ab2a6b9e
10 changed files with 26 additions and 17 deletions

View File

@@ -337,8 +337,9 @@ Optionally, you can use the following command-line flags:
 | Flag | Description |
 |------------------|-------------|
 | `--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. |
-| `--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. |
+| `--rope_freq_base ROPE_FREQ_BASE` | If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63). |
+| `--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale. |

 #### Gradio
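
Note: the new flag's help text pins down the conversion between the two knobs, rope_freq_base = 10000 * alpha_value ^ (64 / 63). The 64/63 exponent corresponds to head_dim / (head_dim - 2) for LLaMA's 128-dimensional heads, as in NTK-aware scaling. A minimal round-trip check of that relationship (the helper names below are illustrative, not from this commit):

```python
# Round-trip check of the documented relationship between alpha_value and
# rope_freq_base. Function names are hypothetical, for illustration only.
def alpha_to_base(alpha_value: float) -> float:
    return 10000 * alpha_value ** (64 / 63)

def base_to_alpha(rope_freq_base: float) -> float:
    return (rope_freq_base / 10000) ** (63 / 64)

print(alpha_to_base(1))                             # 10000.0 -> alpha_value = 1 leaves the base unchanged
print(round(base_to_alpha(alpha_to_base(2.5)), 6))  # 2.5
```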

View File

@@ -3,7 +3,7 @@ from pathlib import Path
 import torch.nn.functional as F
 from torch import version as torch_version

-from modules import shared
+from modules import RoPE, shared
 from modules.logging_colors import logger
 from modules.models import clear_torch_cache
 from modules.text_generation import get_max_prompt_length

@@ -56,8 +56,8 @@ class ExllamaModel:
         config.set_auto_map(shared.args.gpu_split)
         config.gpu_peer_fix = True

-        if shared.args.alpha_value:
-            config.alpha_value = shared.args.alpha_value
+        if shared.args.alpha_value > 1 or shared.args.rope_freq_base > 0:
+            config.alpha_value = RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base)
             config.calculate_rotary_embedding_base()

         if torch_version.hip:
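
Note: the ExLlama loaders now defer to a RoPE.get_alpha_value() helper instead of reading alpha_value directly. The new RoPE module itself is not shown in this excerpt; a plausible sketch of the two helpers the call sites need, assuming they simply prefer rope_freq_base when it is set:

```python
# Sketch of the helpers the loaders call; the actual RoPE module is not shown
# in this diff, so treat these bodies as an assumption consistent with the
# documented formula and the call sites above.
def get_alpha_value(alpha, base):
    # Prefer an explicit rope_freq_base, converting it back to an alpha factor.
    if base > 0:
        return (base / 10000.) ** (63 / 64.)
    else:
        return alpha


def get_rope_freq_base(alpha, base):
    # Prefer an explicit rope_freq_base; otherwise derive it from alpha_value.
    if base > 0:
        return base
    else:
        return 10000 * alpha ** (64 / 63.)
```

With the defaults (alpha_value = 1, rope_freq_base = 0), both helpers reproduce the previous behavior, so the new flag is strictly additive.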

View File

@@ -7,7 +7,7 @@ from torch.nn import CrossEntropyLoss
 from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithPast

-from modules import shared
+from modules import RoPE, shared
 from modules.logging_colors import logger

 try:

@@ -134,8 +134,8 @@ class ExllamaHF(PreTrainedModel):
         config.set_auto_map(shared.args.gpu_split)
         config.gpu_peer_fix = True

-        if shared.args.alpha_value:
-            config.alpha_value = shared.args.alpha_value
+        if shared.args.alpha_value > 1 or shared.args.rope_freq_base > 0:
+            config.alpha_value = RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base)
             config.calculate_rotary_embedding_base()

         if torch.version.hip:

View File

@@ -7,7 +7,7 @@ from torch.nn import CrossEntropyLoss
 from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithPast

-from modules import shared
+from modules import RoPE, shared
 from modules.logging_colors import logger

 import llama_cpp

@@ -185,7 +185,7 @@ class LlamacppHF(PreTrainedModel):
             'mul_mat_q': shared.args.mul_mat_q,
             'low_vram': shared.args.low_vram,
             'n_gpu_layers': shared.args.n_gpu_layers,
-            'rope_freq_base': 10000 * shared.args.alpha_value ** (64 / 63.),
+            'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
             'tensor_split': tensor_split_list,
             'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
             'n_gqa': shared.args.n_gqa or None,
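
Note: for the llama.cpp backends, the old inline expression 10000 * alpha_value ** (64 / 63.) is replaced by the helper, so --rope_freq_base is passed straight through, while compress_pos_emb still maps onto rope_freq_scale as its reciprocal. Illustrative values for two typical configurations (not taken from the diff; the 1,000,000 base is an assumption):

```python
# Illustrative keyword arguments the loader would build in two common setups.
codellama_style = {
    'rope_freq_base': 1_000_000,   # used directly because --rope_freq_base > 0;
                                   # 1,000,000 is the value commonly reported for CodeLlama
    'rope_freq_scale': 1.0 / 1,    # compress_pos_emb left at its default of 1
}

linear_scaling = {
    'rope_freq_base': 10000 * 1 ** (64 / 63.),  # alpha_value default of 1 -> 10000.0
    'rope_freq_scale': 1.0 / 2,                 # --compress_pos_emb 2 for double the original context
}
```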

View File

@@ -3,7 +3,7 @@ from functools import partial
 import torch

-from modules import shared
+from modules import RoPE, shared
 from modules.callbacks import Iteratorize
 from modules.logging_colors import logger
 from modules.text_generation import get_max_prompt_length

@@ -72,7 +72,7 @@ class LlamaCppModel:
             'mul_mat_q': shared.args.mul_mat_q,
             'low_vram': shared.args.low_vram,
             'n_gpu_layers': shared.args.n_gpu_layers,
-            'rope_freq_base': 10000 * shared.args.alpha_value ** (64 / 63.),
+            'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
             'tensor_split': tensor_split_list,
             'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
             'n_gqa': shared.args.n_gqa or None,

View File

@@ -21,6 +21,7 @@ loaders_and_params = OrderedDict({
         'compute_dtype',
         'trust_remote_code',
         'alpha_value',
+        'rope_freq_base',
         'compress_pos_emb',
         'transformers_info'
     ],

@@ -28,6 +29,7 @@ loaders_and_params = OrderedDict({
         'gpu_split',
         'max_seq_len',
         'alpha_value',
+        'rope_freq_base',
         'compress_pos_emb',
         'cfg_cache',
         'exllama_HF_info',

@@ -36,6 +38,7 @@ loaders_and_params = OrderedDict({
         'gpu_split',
         'max_seq_len',
         'alpha_value',
+        'rope_freq_base',
         'compress_pos_emb',
         'exllama_info',
     ],

@@ -77,6 +80,7 @@ loaders_and_params = OrderedDict({
         'mul_mat_q',
         'llama_cpp_seed',
         'alpha_value',
+        'rope_freq_base',
         'compress_pos_emb',
         'cpu',
     ],

@@ -93,6 +97,7 @@ loaders_and_params = OrderedDict({
         'mlock',
         'mul_mat_q',
         'alpha_value',
+        'rope_freq_base',
         'compress_pos_emb',
         'cpu',
         'cfg_cache',

View File

@@ -18,7 +18,7 @@ from transformers import (
 )

 import modules.shared as shared
-from modules import llama_attn_hijack, sampler_hijack
+from modules import llama_attn_hijack, RoPE, sampler_hijack
 from modules.logging_colors import logger
 from modules.models_settings import infer_loader

@@ -219,7 +219,7 @@ def huggingface_loader(model_name):
         if shared.args.compress_pos_emb > 1:
             params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
         elif shared.args.alpha_value > 1:
-            params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value}
+            params['rope_scaling'] = {'type': 'dynamic', 'factor': RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base)}

         model = LoaderClass.from_pretrained(checkpoint, **params)
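
Note: for the Transformers loader, the base is folded back into an alpha factor and passed to from_pretrained() as dynamic NTK rope_scaling. As written above, the branch still requires alpha_value > 1, so --rope_freq_base only reaches Transformers models when an alpha_value is also set. A small illustration, assuming the RoPE helper behaves as sketched earlier:

```python
# Illustration only: values are hypothetical command-line settings.
alpha_value, rope_freq_base = 2, 0   # e.g. --alpha_value 2, rope_freq_base left at its default

# With rope_freq_base > 0 the factor would instead be (base / 10000) ** (63 / 64),
# e.g. roughly 93 for a base of 1,000,000.
factor = (rope_freq_base / 10000) ** (63 / 64) if rope_freq_base > 0 else alpha_value
params = {'rope_scaling': {'type': 'dynamic', 'factor': factor}}
print(params)  # {'rope_scaling': {'type': 'dynamic', 'factor': 2}}
```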

View File

@@ -159,8 +159,9 @@ parser.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The s
 parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.')

 # RoPE
-parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.")
 parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.")
+parser.add_argument('--rope_freq_base', type=int, default=0, help="If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)")
+parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.")

 # Gradio
 parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.')
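
Note: the commit message ties this flag to CodeLlama, which is commonly reported to use a rotary base of 1,000,000 rather than 10,000. That value is not stated in this diff, but it shows why exposing rope_freq_base directly is more practical than alpha_value:

```python
# Assumed value: CodeLlama is commonly cited as using rope_freq_base = 1,000,000.
rope_freq_base = 1_000_000

# Inverting rope_freq_base = 10000 * alpha_value ** (64 / 63):
alpha_value = (rope_freq_base / 10000) ** (63 / 64)
print(round(alpha_value, 1))  # ~93.1, far beyond the 1-8 range the alpha_value UI slider exposes
```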

View File

@@ -79,7 +79,8 @@ def list_model_elements():
         'gpu_split',
         'max_seq_len',
         'compress_pos_emb',
-        'alpha_value'
+        'alpha_value',
+        'rope_freq_base'
     ]

     for i in range(torch.cuda.device_count()):

View File

@@ -91,7 +91,8 @@ def create_ui():
             shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
             shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len)
             shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.1, info='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
-            shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length).', value=shared.args.compress_pos_emb)
+            shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=100000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base)
+            shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb)

         with gr.Column():
             shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)