diff --git a/README.md b/README.md index 3df9a16f..f7e18350 100644 --- a/README.md +++ b/README.md @@ -299,12 +299,12 @@ Optionally, you can use the following command-line flags: | `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". | | `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. | -#### RoPE (for llama.cpp and ExLlama only) +#### RoPE (for llama.cpp, ExLlama, and transformers) | Flag | Description | |------------------|-------------| |`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. | -|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both. | +|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. | #### Gradio diff --git a/modules/loaders.py b/modules/loaders.py index 519e47a7..07bc455c 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -39,8 +39,8 @@ loaders_and_params = { 'low_vram', 'mlock', 'llama_cpp_seed', - 'compress_pos_emb', 'alpha_value', + 'compress_pos_emb', 'cpu', ], 'llamacpp_HF': [ @@ -54,8 +54,8 @@ loaders_and_params = { 'low_vram', 'mlock', 'llama_cpp_seed', - 'compress_pos_emb', 'alpha_value', + 'compress_pos_emb', 'cpu', 'llamacpp_HF_info', ], @@ -73,20 +73,22 @@ loaders_and_params = { 'quant_type', 'compute_dtype', 'trust_remote_code', + 'alpha_value', + 'compress_pos_emb', 'transformers_info' ], 'ExLlama': [ 'gpu_split', 'max_seq_len', - 'compress_pos_emb', 'alpha_value', + 'compress_pos_emb', 'exllama_info', ], 'ExLlama_HF': [ 'gpu_split', 'max_seq_len', - 'compress_pos_emb', 'alpha_value', + 'compress_pos_emb', 'exllama_HF_info', ] } diff --git a/modules/models.py b/modules/models.py index 4f6a44c1..aad142c1 100644 --- a/modules/models.py +++ b/modules/models.py @@ -144,7 +144,7 @@ def huggingface_loader(model_name): LoaderClass = AutoModelForCausalLM # Load the model in simple 16-bit mode by default - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None]): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1]): model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16, trust_remote_code=shared.args.trust_remote_code) if torch.backends.mps.is_available(): device = torch.device('mps') @@ -215,6 +215,11 @@ def huggingface_loader(model_name): no_split_module_classes=model._no_split_modules ) + if shared.args.compress_pos_emb > 1: + params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb} + elif shared.args.alpha_value > 1: + params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value} + model = LoaderClass.from_pretrained(checkpoint, **params) return model diff --git a/modules/shared.py b/modules/shared.py index 05c402c4..951120c8 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -164,7 +164,7 @@ parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile t # RoPE parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.") -parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both.") +parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.") # Gradio parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.') diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 7961c225..55416a07 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -89,8 +89,8 @@ def create_ui(): shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len) + shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.1, info='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.', value=shared.args.compress_pos_emb) - shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.1, info='Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) with gr.Column(): shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)