mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2025-01-24 18:49:23 +01:00
commit
b5c53041b8
1
.gitignore
vendored
1
.gitignore
vendored
@ -26,6 +26,7 @@
|
||||
.DS_Store
|
||||
.eslintrc.js
|
||||
.idea
|
||||
.env
|
||||
.venv
|
||||
venv
|
||||
.vscode
|
||||
|
@ -300,6 +300,7 @@ Optionally, you can use the following command-line flags:
|
||||
| `--sdp-attention` | Use PyTorch 2.0's SDP attention. Same as above. |
|
||||
| `--trust-remote-code` | Set `trust_remote_code=True` while loading the model. Necessary for some models. |
|
||||
| `--use_fast` | Set `use_fast=True` while loading the tokenizer. |
|
||||
| `--use_flash_attention_2` | Set use_flash_attention_2=True while loading the model. |
|
||||
|
||||
#### Accelerate 4-bit
|
||||
|
||||
@ -336,6 +337,8 @@ Optionally, you can use the following command-line flags:
|
||||
|`--gpu-split` | Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7. |
|
||||
|`--max_seq_len MAX_SEQ_LEN` | Maximum sequence length. |
|
||||
|`--cfg-cache` | ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama. |
|
||||
|`--no_flash_attn` | Force flash-attention to not be used. |
|
||||
|`--cache_8bit` | Use 8-bit cache to save VRAM. |
|
||||
|
||||
#### AutoGPTQ
|
||||
|
||||
|
@ -25,7 +25,9 @@ def build_parameters(body, chat=False):
|
||||
'max_tokens_second': int(body.get('max_tokens_second', 0)),
|
||||
'do_sample': bool(body.get('do_sample', True)),
|
||||
'temperature': float(body.get('temperature', 0.5)),
|
||||
'temperature_last': bool(body.get('temperature_last', False)),
|
||||
'top_p': float(body.get('top_p', 1)),
|
||||
'min_p': float(body.get('min_p', 0)),
|
||||
'typical_p': float(body.get('typical_p', body.get('typical', 1))),
|
||||
'epsilon_cutoff': float(body.get('epsilon_cutoff', 0)),
|
||||
'eta_cutoff': float(body.get('eta_cutoff', 0)),
|
||||
|
@ -45,9 +45,6 @@
|
||||
.*starchat-beta:
|
||||
instruction_template: 'Starchat-Beta'
|
||||
custom_stopping_strings: '"<|end|>"'
|
||||
.*(openorca-platypus2):
|
||||
instruction_template: 'OpenOrca-Platypus2'
|
||||
custom_stopping_strings: '"### Instruction:", "### Response:"'
|
||||
(?!.*v0)(?!.*1.1)(?!.*1_1)(?!.*stable)(?!.*chinese).*vicuna:
|
||||
instruction_template: 'Vicuna-v0'
|
||||
.*vicuna.*v0:
|
||||
@ -152,6 +149,9 @@
|
||||
instruction_template: 'Orca Mini'
|
||||
.*(platypus|gplatty|superplatty):
|
||||
instruction_template: 'Alpaca'
|
||||
.*(openorca-platypus2):
|
||||
instruction_template: 'OpenOrca-Platypus2'
|
||||
custom_stopping_strings: '"### Instruction:", "### Response:"'
|
||||
.*longchat:
|
||||
instruction_template: 'Vicuna-v1.1'
|
||||
.*vicuna-33b:
|
||||
|
@ -62,7 +62,7 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc
|
||||
from safetensors.torch import load_file as safe_load
|
||||
model.load_state_dict(safe_load(checkpoint), strict=False)
|
||||
else:
|
||||
model.load_state_dict(torch.load(checkpoint), strict=False)
|
||||
model.load_state_dict(torch.load(checkpoint, weights_only=True), strict=False)
|
||||
|
||||
model.seqlen = 2048
|
||||
return model
|
||||
|
@ -6,6 +6,7 @@ import torch
|
||||
from exllamav2 import (
|
||||
ExLlamaV2,
|
||||
ExLlamaV2Cache,
|
||||
ExLlamaV2Cache_8bit,
|
||||
ExLlamaV2Config,
|
||||
ExLlamaV2Tokenizer
|
||||
)
|
||||
@ -46,6 +47,7 @@ class Exllamav2Model:
|
||||
config.max_seq_len = shared.args.max_seq_len
|
||||
config.scale_pos_emb = shared.args.compress_pos_emb
|
||||
config.scale_alpha_value = shared.args.alpha_value
|
||||
config.no_flash_attn = shared.args.no_flash_attn
|
||||
|
||||
model = ExLlamaV2(config)
|
||||
|
||||
@ -56,7 +58,11 @@ class Exllamav2Model:
|
||||
model.load(split)
|
||||
|
||||
tokenizer = ExLlamaV2Tokenizer(config)
|
||||
cache = ExLlamaV2Cache(model)
|
||||
if shared.args.cache_8bit:
|
||||
cache = ExLlamaV2Cache_8bit(model)
|
||||
else:
|
||||
cache = ExLlamaV2Cache(model)
|
||||
|
||||
generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
|
||||
|
||||
result = self()
|
||||
|
@ -4,7 +4,12 @@ from pathlib import Path
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
import torch
|
||||
from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config
|
||||
from exllamav2 import (
|
||||
ExLlamaV2,
|
||||
ExLlamaV2Cache,
|
||||
ExLlamaV2Cache_8bit,
|
||||
ExLlamaV2Config
|
||||
)
|
||||
from torch.nn import CrossEntropyLoss
|
||||
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
|
||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||
@ -40,11 +45,18 @@ class Exllamav2HF(PreTrainedModel):
|
||||
self.generation_config = GenerationConfig()
|
||||
self.loras = None
|
||||
|
||||
self.ex_cache = ExLlamaV2Cache(self.ex_model)
|
||||
self.past_seq = None
|
||||
if shared.args.cache_8bit:
|
||||
self.ex_cache = ExLlamaV2Cache_8bit(self.ex_model)
|
||||
else:
|
||||
self.ex_cache = ExLlamaV2Cache(self.ex_model)
|
||||
|
||||
self.past_seq = None
|
||||
if shared.args.cfg_cache:
|
||||
self.ex_cache_negative = ExLlamaV2Cache(self.ex_model)
|
||||
if shared.args.cache_8bit:
|
||||
self.ex_cache_negative = ExLlamaV2Cache_8bit(self.ex_model)
|
||||
else:
|
||||
self.ex_cache_negative = ExLlamaV2Cache(self.ex_model)
|
||||
|
||||
self.past_seq_negative = None
|
||||
|
||||
def _validate_model_class(self):
|
||||
@ -152,5 +164,6 @@ class Exllamav2HF(PreTrainedModel):
|
||||
config.max_seq_len = shared.args.max_seq_len
|
||||
config.scale_pos_emb = shared.args.compress_pos_emb
|
||||
config.scale_alpha_value = shared.args.alpha_value
|
||||
config.no_flash_attn = shared.args.no_flash_attn
|
||||
|
||||
return Exllamav2HF(config)
|
||||
|
@ -9,7 +9,6 @@ loaders_and_params = OrderedDict({
|
||||
'Transformers': [
|
||||
'cpu_memory',
|
||||
'gpu_memory',
|
||||
'trust_remote_code',
|
||||
'load_in_8bit',
|
||||
'bf16',
|
||||
'cpu',
|
||||
@ -21,6 +20,7 @@ loaders_and_params = OrderedDict({
|
||||
'compute_dtype',
|
||||
'trust_remote_code',
|
||||
'use_fast',
|
||||
'use_flash_attention_2',
|
||||
'alpha_value',
|
||||
'rope_freq_base',
|
||||
'compress_pos_emb',
|
||||
@ -41,6 +41,8 @@ loaders_and_params = OrderedDict({
|
||||
'gpu_split',
|
||||
'max_seq_len',
|
||||
'cfg_cache',
|
||||
'no_flash_attn',
|
||||
'cache_8bit',
|
||||
'alpha_value',
|
||||
'compress_pos_emb',
|
||||
'use_fast',
|
||||
@ -56,6 +58,8 @@ loaders_and_params = OrderedDict({
|
||||
'ExLlamav2': [
|
||||
'gpu_split',
|
||||
'max_seq_len',
|
||||
'no_flash_attn',
|
||||
'cache_8bit',
|
||||
'alpha_value',
|
||||
'compress_pos_emb',
|
||||
],
|
||||
@ -144,7 +148,9 @@ loaders_and_params = OrderedDict({
|
||||
loaders_samplers = {
|
||||
'Transformers': {
|
||||
'temperature',
|
||||
'temperature_last',
|
||||
'top_p',
|
||||
'min_p',
|
||||
'top_k',
|
||||
'typical_p',
|
||||
'epsilon_cutoff',
|
||||
@ -179,7 +185,9 @@ loaders_samplers = {
|
||||
},
|
||||
'ExLlama_HF': {
|
||||
'temperature',
|
||||
'temperature_last',
|
||||
'top_p',
|
||||
'min_p',
|
||||
'top_k',
|
||||
'typical_p',
|
||||
'epsilon_cutoff',
|
||||
@ -239,7 +247,9 @@ loaders_samplers = {
|
||||
},
|
||||
'ExLlamav2_HF': {
|
||||
'temperature',
|
||||
'temperature_last',
|
||||
'top_p',
|
||||
'min_p',
|
||||
'top_k',
|
||||
'typical_p',
|
||||
'epsilon_cutoff',
|
||||
@ -270,7 +280,9 @@ loaders_samplers = {
|
||||
},
|
||||
'AutoGPTQ': {
|
||||
'temperature',
|
||||
'temperature_last',
|
||||
'top_p',
|
||||
'min_p',
|
||||
'top_k',
|
||||
'typical_p',
|
||||
'epsilon_cutoff',
|
||||
@ -305,7 +317,9 @@ loaders_samplers = {
|
||||
},
|
||||
'GPTQ-for-LLaMa': {
|
||||
'temperature',
|
||||
'temperature_last',
|
||||
'top_p',
|
||||
'min_p',
|
||||
'top_k',
|
||||
'typical_p',
|
||||
'epsilon_cutoff',
|
||||
@ -356,7 +370,9 @@ loaders_samplers = {
|
||||
},
|
||||
'llamacpp_HF': {
|
||||
'temperature',
|
||||
'temperature_last',
|
||||
'top_p',
|
||||
'min_p',
|
||||
'top_k',
|
||||
'typical_p',
|
||||
'epsilon_cutoff',
|
||||
@ -394,7 +410,9 @@ loaders_samplers = {
|
||||
},
|
||||
'AutoAWQ': {
|
||||
'temperature',
|
||||
'temperature_last',
|
||||
'top_p',
|
||||
'min_p',
|
||||
'top_k',
|
||||
'typical_p',
|
||||
'epsilon_cutoff',
|
||||
|
@ -123,8 +123,13 @@ def huggingface_loader(model_name):
|
||||
params = {
|
||||
'low_cpu_mem_usage': True,
|
||||
'trust_remote_code': shared.args.trust_remote_code,
|
||||
'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16
|
||||
'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
|
||||
'use_safetensors': True if shared.args.force_safetensors else None
|
||||
}
|
||||
|
||||
if shared.args.use_flash_attention_2:
|
||||
params['use_flash_attention_2'] = True
|
||||
|
||||
config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=params['trust_remote_code'])
|
||||
|
||||
if 'chatglm' in model_name.lower():
|
||||
|
@ -8,7 +8,9 @@ def default_preset():
|
||||
return {
|
||||
'do_sample': True,
|
||||
'temperature': 1,
|
||||
'temperature_last': False,
|
||||
'top_p': 1,
|
||||
'min_p': 0,
|
||||
'top_k': 0,
|
||||
'typical_p': 1,
|
||||
'epsilon_cutoff': 0,
|
||||
|
@ -13,6 +13,36 @@ from transformers.generation.logits_process import (
|
||||
global_scores = None
|
||||
|
||||
|
||||
class MinPLogitsWarper(LogitsWarper):
|
||||
def __init__(self, min_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
|
||||
if min_p < 0 or min_p > 1.0:
|
||||
raise ValueError(f"`min_p` has to be a float >= 0 and <= 1, but is {min_p}")
|
||||
self.min_p = min_p
|
||||
self.filter_value = filter_value
|
||||
self.min_tokens_to_keep = min_tokens_to_keep
|
||||
|
||||
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
|
||||
# Convert logits to probabilities
|
||||
probs = torch.softmax(scores, dim=-1)
|
||||
# Get the probability of the top token for each sequence in the batch
|
||||
top_probs, _ = probs.max(dim=-1, keepdim=True)
|
||||
# Calculate the actual min_p threshold by scaling min_p with the top token's probability
|
||||
scaled_min_p = self.min_p * top_probs
|
||||
# Create a mask for tokens that have a probability less than the scaled min_p
|
||||
tokens_to_remove = probs < scaled_min_p
|
||||
|
||||
sorted_indices = torch.argsort(scores, descending=True, dim=-1)
|
||||
sorted_indices_to_remove = torch.gather(tokens_to_remove, dim=-1, index=sorted_indices)
|
||||
|
||||
if self.min_tokens_to_keep > 1:
|
||||
# Keep at least min_tokens_to_keep
|
||||
sorted_indices_to_remove[..., : self.min_tokens_to_keep] = False
|
||||
|
||||
indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
|
||||
scores = scores.masked_fill(indices_to_remove, self.filter_value)
|
||||
return scores
|
||||
|
||||
|
||||
class TailFreeLogitsWarper(LogitsWarper):
|
||||
def __init__(self, tfs: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
|
||||
tfs = float(tfs)
|
||||
@ -186,17 +216,36 @@ def get_logits_warper_patch(self, generation_config):
|
||||
if not isinstance(warper, TemperatureLogitsWarper):
|
||||
warpers.remove(warper)
|
||||
else:
|
||||
if generation_config.tfs is not None and 0.0 <= generation_config.tfs <= 1.0:
|
||||
if generation_config.tfs is not None and 0.0 <= generation_config.tfs < 1.0:
|
||||
warpers_to_add.append(TailFreeLogitsWarper(tfs=generation_config.tfs, min_tokens_to_keep=min_tokens_to_keep))
|
||||
if generation_config.top_a is not None and 0.0 <= generation_config.top_a <= 1.0:
|
||||
if generation_config.top_a is not None and 0.0 < generation_config.top_a <= 1.0:
|
||||
warpers_to_add.append(TopALogitsWarper(top_a=generation_config.top_a, min_tokens_to_keep=min_tokens_to_keep))
|
||||
if generation_config.min_p is not None and 0.0 < generation_config.min_p <= 1.0:
|
||||
warpers_to_add.append(MinPLogitsWarper(min_p=generation_config.min_p, min_tokens_to_keep=min_tokens_to_keep))
|
||||
|
||||
if warpers and isinstance(warpers[-1], LogitNormalization):
|
||||
warpers = warpers[:-1] + warpers_to_add + [warpers[-1]]
|
||||
if len(warpers) > 0 and isinstance(warpers[-1], LogitNormalization):
|
||||
normalize = warpers.pop(-1)
|
||||
else:
|
||||
warpers += warpers_to_add
|
||||
normalize = None
|
||||
|
||||
warpers += warpers_to_add
|
||||
if generation_config.temperature_last:
|
||||
temperature_idx = None
|
||||
for i in range(len(warpers)):
|
||||
if warpers[i].__class__.__name__ == 'TemperatureLogitsWarper':
|
||||
temperature_idx = i
|
||||
break
|
||||
|
||||
if temperature_idx is not None:
|
||||
warpers = warpers[:temperature_idx] + warpers[temperature_idx + 1:] + [warpers[temperature_idx]]
|
||||
warpers = LogitsProcessorList(warpers)
|
||||
|
||||
if normalize is not None:
|
||||
warpers.append(normalize)
|
||||
|
||||
warpers.append(SpyLogitsWarper())
|
||||
# for i in range(len(warpers)):
|
||||
# print(warpers[i].__class__.__name__)
|
||||
return warpers
|
||||
|
||||
|
||||
@ -222,6 +271,7 @@ def get_logits_processor_patch(self, **kwargs):
|
||||
|
||||
def generation_config_init_patch(self, **kwargs):
|
||||
self.__init___old(**kwargs)
|
||||
self.min_p = kwargs.pop("min_p", 0.0)
|
||||
self.tfs = kwargs.pop("tfs", 1.0)
|
||||
self.top_a = kwargs.pop("top_a", 0.0)
|
||||
self.mirostat_mode = kwargs.pop("mirostat_mode", 0)
|
||||
@ -230,6 +280,7 @@ def generation_config_init_patch(self, **kwargs):
|
||||
self.repetition_penalty_range = kwargs.pop("repetition_penalty_range", 0)
|
||||
self.presence_penalty = kwargs.pop("presence_penalty", 0)
|
||||
self.frequency_penalty = kwargs.pop("frequency_penalty", 0)
|
||||
self.temperature_last = kwargs.pop("temperature_last", False)
|
||||
|
||||
|
||||
def hijack_samplers():
|
||||
|
@ -91,7 +91,9 @@ parser.add_argument('--no-cache', action='store_true', help='Set use_cache to Fa
|
||||
parser.add_argument('--xformers', action='store_true', help='Use xformer\'s memory efficient attention. This is really old and probably doesn\'t do anything.')
|
||||
parser.add_argument('--sdp-attention', action='store_true', help='Use PyTorch 2.0\'s SDP attention. Same as above.')
|
||||
parser.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
|
||||
parser.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
|
||||
parser.add_argument('--use_fast', action='store_true', help='Set use_fast=True while loading the tokenizer.')
|
||||
parser.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.')
|
||||
|
||||
# Accelerate 4-bit
|
||||
parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).')
|
||||
@ -117,6 +119,8 @@ parser.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (
|
||||
parser.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
|
||||
parser.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequence length.')
|
||||
parser.add_argument('--cfg-cache', action='store_true', help='ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama.')
|
||||
parser.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
|
||||
parser.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.')
|
||||
|
||||
# AutoGPTQ
|
||||
parser.add_argument('--triton', action='store_true', help='Use triton.')
|
||||
|
@ -274,7 +274,7 @@ def apply_stopping_strings(reply, all_stop_strings):
|
||||
|
||||
def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False):
|
||||
generate_params = {}
|
||||
for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'tfs', 'top_a', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'guidance_scale']:
|
||||
for k in ['max_new_tokens', 'do_sample', 'temperature', 'temperature_last', 'top_p', 'min_p', 'typical_p', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'tfs', 'top_a', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'guidance_scale']:
|
||||
generate_params[k] = state[k]
|
||||
|
||||
if state['negative_prompt'] != '':
|
||||
|
@ -544,7 +544,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
|
||||
lora_model = get_peft_model(shared.model, config)
|
||||
if not always_override and Path(f"{lora_file_path}/adapter_model.bin").is_file():
|
||||
logger.info("Loading existing LoRA data...")
|
||||
state_dict_peft = torch.load(f"{lora_file_path}/adapter_model.bin")
|
||||
state_dict_peft = torch.load(f"{lora_file_path}/adapter_model.bin", weights_only=True)
|
||||
set_peft_model_state_dict(lora_model, state_dict_peft)
|
||||
except:
|
||||
yield traceback.format_exc().replace('\n', '\n\n')
|
||||
|
@ -53,6 +53,7 @@ def list_model_elements():
|
||||
'load_in_8bit',
|
||||
'trust_remote_code',
|
||||
'use_fast',
|
||||
'use_flash_attention_2',
|
||||
'load_in_4bit',
|
||||
'compute_dtype',
|
||||
'quant_type',
|
||||
@ -68,6 +69,8 @@ def list_model_elements():
|
||||
'no_use_cuda_fp16',
|
||||
'disable_exllama',
|
||||
'cfg_cache',
|
||||
'no_flash_attn',
|
||||
'cache_8bit',
|
||||
'threads',
|
||||
'threads_batch',
|
||||
'n_batch',
|
||||
@ -102,7 +105,9 @@ def list_interface_input_elements():
|
||||
'max_tokens_second',
|
||||
'seed',
|
||||
'temperature',
|
||||
'temperature_last',
|
||||
'top_p',
|
||||
'min_p',
|
||||
'top_k',
|
||||
'typical_p',
|
||||
'epsilon_cutoff',
|
||||
|
@ -86,7 +86,7 @@ def create_ui():
|
||||
shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type)
|
||||
|
||||
shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=128, value=shared.args.n_gpu_layers)
|
||||
shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=32768, step=256, label="n_ctx", value=shared.args.n_ctx)
|
||||
shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=shared.settings['truncation_length_max'], step=256, label="n_ctx", value=shared.args.n_ctx)
|
||||
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads)
|
||||
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=32, value=shared.args.threads_batch)
|
||||
shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch)
|
||||
@ -97,7 +97,7 @@ def create_ui():
|
||||
shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0)
|
||||
shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.')
|
||||
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
|
||||
shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=32768, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len)
|
||||
shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Maximum sequence length.', value=shared.args.max_seq_len)
|
||||
shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.05, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
|
||||
shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=1000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base)
|
||||
shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb)
|
||||
@ -124,9 +124,12 @@ def create_ui():
|
||||
shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
|
||||
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code)
|
||||
shared.gradio['use_fast'] = gr.Checkbox(label="use_fast", value=shared.args.use_fast, info='Set use_fast=True while loading the tokenizer. May trigger a conversion that takes several minutes.')
|
||||
shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
|
||||
shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.')
|
||||
shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.')
|
||||
shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
|
||||
shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
|
||||
shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).')
|
||||
shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/wiki/04-%E2%80%90-Model-Tab#exllama_hf).')
|
||||
shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.')
|
||||
shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1: download `oobabooga/llama-tokenizer` under "Download model or LoRA". That\'s a default Llama tokenizer.\n\nOption 2: place your .gguf in a subfolder of models/ along with these 3 files: tokenizer.model, tokenizer_config.json, and special_tokens_map.json. This takes precedence over Option 1.')
|
||||
|
||||
@ -212,6 +215,9 @@ def load_model_wrapper(selected_model, loader, autoload=False):
|
||||
if 'instruction_template' in settings:
|
||||
output += '\n\nIt seems to be an instruction-following model with template "{}". In the chat tab, instruct or chat-instruct modes should be used.'.format(settings['instruction_template'])
|
||||
|
||||
# Applying the changes to the global shared settings (in-memory)
|
||||
shared.settings.update({k: v for k, v in settings.items() if k in shared.settings})
|
||||
|
||||
yield output
|
||||
else:
|
||||
yield f"Failed to load `{selected_model}`."
|
||||
|
@ -29,9 +29,10 @@ def create_ui(default_preset):
|
||||
shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens'])
|
||||
shared.gradio['temperature'] = gr.Slider(0.01, 1.99, value=generate_params['temperature'], step=0.01, label='temperature')
|
||||
shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p')
|
||||
shared.gradio['min_p'] = gr.Slider(0.0, 1.0, value=generate_params['min_p'], step=0.01, label='min_p')
|
||||
shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k')
|
||||
shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty')
|
||||
shared.gradio['presence_penalty'] = gr.Slider(0, 4, value=generate_params['presence_penalty'], step=0.05, label='presence_penalty')
|
||||
shared.gradio['presence_penalty'] = gr.Slider(0, 2, value=generate_params['presence_penalty'], step=0.05, label='presence_penalty')
|
||||
shared.gradio['frequency_penalty'] = gr.Slider(0, 2, value=generate_params['frequency_penalty'], step=0.05, label='frequency_penalty')
|
||||
shared.gradio['repetition_penalty_range'] = gr.Slider(0, 4096, step=64, value=generate_params['repetition_penalty_range'], label='repetition_penalty_range')
|
||||
shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p')
|
||||
@ -47,6 +48,7 @@ def create_ui(default_preset):
|
||||
shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.')
|
||||
shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau')
|
||||
shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta')
|
||||
shared.gradio['temperature_last'] = gr.Checkbox(value=generate_params['temperature_last'], label='temperature_last', info='Makes temperature the last sampler instead of the first.')
|
||||
shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample')
|
||||
shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)')
|
||||
with gr.Accordion('Other parameters', open=False):
|
||||
@ -57,7 +59,7 @@ def create_ui(default_preset):
|
||||
shared.gradio['length_penalty'] = gr.Slider(-5, 5, value=generate_params['length_penalty'], label='length_penalty')
|
||||
shared.gradio['early_stopping'] = gr.Checkbox(value=generate_params['early_stopping'], label='early_stopping')
|
||||
|
||||
gr.Markdown("[Learn more](https://github.com/oobabooga/text-generation-webui/blob/main/docs/Generation-Parameters.md)")
|
||||
gr.Markdown("[Learn more](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab)")
|
||||
|
||||
with gr.Column():
|
||||
with gr.Row():
|
||||
|
@ -2,7 +2,7 @@ accelerate==0.24.*
|
||||
colorama
|
||||
datasets
|
||||
einops
|
||||
exllamav2==0.0.6; platform_system != "Darwin" and platform_machine != "x86_64"
|
||||
exllamav2==0.0.7; platform_system != "Darwin" and platform_machine != "x86_64"
|
||||
gradio==3.50.*
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
@ -16,7 +16,7 @@ safetensors==0.4.0
|
||||
scipy
|
||||
sentencepiece
|
||||
tensorboard
|
||||
transformers==4.34.*
|
||||
transformers==4.35.*
|
||||
tqdm
|
||||
wandb
|
||||
|
||||
@ -53,14 +53,14 @@ https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121
|
||||
https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
|
||||
https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
|
||||
https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
|
||||
https://github.com/bdashore3/flash-attention/releases/download/2.3.2-2/flash_attn-2.3.2+cu122-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
|
||||
https://github.com/bdashore3/flash-attention/releases/download/2.3.2-2/flash_attn-2.3.2+cu122-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
|
||||
https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||
@ -84,4 +84,4 @@ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_
|
||||
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
|
||||
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
|
||||
https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu121-py3-none-any.whl
|
||||
autoawq==0.1.4; platform_system == "Linux" or platform_system == "Windows"
|
||||
autoawq==0.1.6; platform_system == "Linux" or platform_system == "Windows"
|
||||
|
@ -2,7 +2,7 @@ accelerate==0.24.*
|
||||
colorama
|
||||
datasets
|
||||
einops
|
||||
exllamav2==0.0.6
|
||||
exllamav2==0.0.7
|
||||
gradio==3.50.*
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
@ -16,7 +16,7 @@ safetensors==0.4.0
|
||||
scipy
|
||||
sentencepiece
|
||||
tensorboard
|
||||
transformers==4.34.*
|
||||
transformers==4.35.*
|
||||
tqdm
|
||||
wandb
|
||||
|
||||
|
@ -2,7 +2,7 @@ accelerate==0.24.*
|
||||
colorama
|
||||
datasets
|
||||
einops
|
||||
exllamav2==0.0.6
|
||||
exllamav2==0.0.7
|
||||
gradio==3.50.*
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
@ -16,7 +16,7 @@ safetensors==0.4.0
|
||||
scipy
|
||||
sentencepiece
|
||||
tensorboard
|
||||
transformers==4.34.*
|
||||
transformers==4.35.*
|
||||
tqdm
|
||||
wandb
|
||||
|
||||
|
@ -2,7 +2,7 @@ accelerate==0.24.*
|
||||
colorama
|
||||
datasets
|
||||
einops
|
||||
exllamav2==0.0.6
|
||||
exllamav2==0.0.7
|
||||
gradio==3.50.*
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
@ -16,7 +16,7 @@ safetensors==0.4.0
|
||||
scipy
|
||||
sentencepiece
|
||||
tensorboard
|
||||
transformers==4.34.*
|
||||
transformers==4.35.*
|
||||
tqdm
|
||||
wandb
|
||||
|
||||
|
@ -2,7 +2,7 @@ accelerate==0.24.*
|
||||
colorama
|
||||
datasets
|
||||
einops
|
||||
exllamav2==0.0.6
|
||||
exllamav2==0.0.7
|
||||
gradio==3.50.*
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
@ -16,7 +16,7 @@ safetensors==0.4.0
|
||||
scipy
|
||||
sentencepiece
|
||||
tensorboard
|
||||
transformers==4.34.*
|
||||
transformers==4.35.*
|
||||
tqdm
|
||||
wandb
|
||||
|
||||
|
@ -2,7 +2,7 @@ accelerate==0.24.*
|
||||
colorama
|
||||
datasets
|
||||
einops
|
||||
exllamav2==0.0.6
|
||||
exllamav2==0.0.7
|
||||
gradio==3.50.*
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
@ -16,7 +16,7 @@ safetensors==0.4.0
|
||||
scipy
|
||||
sentencepiece
|
||||
tensorboard
|
||||
transformers==4.34.*
|
||||
transformers==4.35.*
|
||||
tqdm
|
||||
wandb
|
||||
|
||||
|
@ -2,7 +2,7 @@ accelerate==0.24.*
|
||||
colorama
|
||||
datasets
|
||||
einops
|
||||
exllamav2==0.0.6
|
||||
exllamav2==0.0.7
|
||||
gradio==3.50.*
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
@ -16,7 +16,7 @@ safetensors==0.4.0
|
||||
scipy
|
||||
sentencepiece
|
||||
tensorboard
|
||||
transformers==4.34.*
|
||||
transformers==4.35.*
|
||||
tqdm
|
||||
wandb
|
||||
|
||||
|
@ -2,7 +2,7 @@ accelerate==0.24.*
|
||||
colorama
|
||||
datasets
|
||||
einops
|
||||
exllamav2==0.0.6; platform_system != "Darwin" and platform_machine != "x86_64"
|
||||
exllamav2==0.0.7; platform_system != "Darwin" and platform_machine != "x86_64"
|
||||
gradio==3.50.*
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
@ -16,7 +16,7 @@ safetensors==0.4.0
|
||||
scipy
|
||||
sentencepiece
|
||||
tensorboard
|
||||
transformers==4.34.*
|
||||
transformers==4.35.*
|
||||
tqdm
|
||||
wandb
|
||||
|
||||
@ -53,14 +53,14 @@ https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121
|
||||
https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
|
||||
https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
|
||||
https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
|
||||
https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
|
||||
https://github.com/bdashore3/flash-attention/releases/download/2.3.2-2/flash_attn-2.3.2+cu122-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
|
||||
https://github.com/bdashore3/flash-attention/releases/download/2.3.2-2/flash_attn-2.3.2+cu122-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
|
||||
https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||
@ -84,4 +84,4 @@ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_
|
||||
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
|
||||
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
|
||||
https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX/ctransformers-0.2.27+cu121-py3-none-any.whl
|
||||
autoawq==0.1.4; platform_system == "Linux" or platform_system == "Windows"
|
||||
autoawq==0.1.6; platform_system == "Linux" or platform_system == "Windows"
|
||||
|
@ -2,7 +2,7 @@ accelerate==0.24.*
|
||||
colorama
|
||||
datasets
|
||||
einops
|
||||
exllamav2==0.0.6
|
||||
exllamav2==0.0.7
|
||||
gradio==3.50.*
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
@ -16,7 +16,7 @@ safetensors==0.4.0
|
||||
scipy
|
||||
sentencepiece
|
||||
tensorboard
|
||||
transformers==4.34.*
|
||||
transformers==4.35.*
|
||||
tqdm
|
||||
wandb
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user