Merge pull request #4475 from oobabooga/dev

Merge dev branch
oobabooga 2023-11-04 14:19:55 -03:00 committed by GitHub
commit b5c53041b8
26 changed files with 177 additions and 59 deletions

.gitignore vendored

@@ -26,6 +26,7 @@
 .DS_Store
 .eslintrc.js
 .idea
+.env
 .venv
 venv
 .vscode


@@ -300,6 +300,7 @@ Optionally, you can use the following command-line flags:
 | `--sdp-attention` | Use PyTorch 2.0's SDP attention. Same as above. |
 | `--trust-remote-code` | Set `trust_remote_code=True` while loading the model. Necessary for some models. |
 | `--use_fast` | Set `use_fast=True` while loading the tokenizer. |
+| `--use_flash_attention_2` | Set use_flash_attention_2=True while loading the model. |
 #### Accelerate 4-bit
@@ -336,6 +337,8 @@ Optionally, you can use the following command-line flags:
 |`--gpu-split` | Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7. |
 |`--max_seq_len MAX_SEQ_LEN` | Maximum sequence length. |
 |`--cfg-cache` | ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama. |
+|`--no_flash_attn` | Force flash-attention to not be used. |
+|`--cache_8bit` | Use 8-bit cache to save VRAM. |
 #### AutoGPTQ


@@ -25,7 +25,9 @@ def build_parameters(body, chat=False):
         'max_tokens_second': int(body.get('max_tokens_second', 0)),
         'do_sample': bool(body.get('do_sample', True)),
         'temperature': float(body.get('temperature', 0.5)),
+        'temperature_last': bool(body.get('temperature_last', False)),
         'top_p': float(body.get('top_p', 1)),
+        'min_p': float(body.get('min_p', 0)),
         'typical_p': float(body.get('typical_p', body.get('typical', 1))),
         'epsilon_cutoff': float(body.get('epsilon_cutoff', 0)),
         'eta_cutoff': float(body.get('eta_cutoff', 0)),
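For reference, a minimal sketch of a request body that exercises the two new sampling fields accepted by `build_parameters` above. The endpoint address and port are assumptions based on the api extension's usual defaults, not part of this diff; any key left out falls back to the defaults shown in the function.

```python
# Sketch: POST body for the legacy api extension (address/port assumed).
# 'min_p' and 'temperature_last' are the fields added in this commit.
import json
import urllib.request

body = {
    'prompt': 'Write a haiku about autumn.',
    'max_new_tokens': 200,
    'do_sample': True,
    'temperature': 0.7,
    'temperature_last': True,   # apply temperature after the other samplers
    'top_p': 1.0,
    'min_p': 0.05,              # drop tokens below 5% of the top token's probability
}

req = urllib.request.Request(
    'http://127.0.0.1:5000/api/v1/generate',   # assumed default endpoint
    data=json.dumps(body).encode('utf-8'),
    headers={'Content-Type': 'application/json'},
)
# response = urllib.request.urlopen(req)  # uncomment with a running server
```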


@@ -45,9 +45,6 @@
 .*starchat-beta:
   instruction_template: 'Starchat-Beta'
   custom_stopping_strings: '"<|end|>"'
-.*(openorca-platypus2):
-  instruction_template: 'OpenOrca-Platypus2'
-  custom_stopping_strings: '"### Instruction:", "### Response:"'
 (?!.*v0)(?!.*1.1)(?!.*1_1)(?!.*stable)(?!.*chinese).*vicuna:
   instruction_template: 'Vicuna-v0'
 .*vicuna.*v0:
@@ -152,6 +149,9 @@
   instruction_template: 'Orca Mini'
 .*(platypus|gplatty|superplatty):
   instruction_template: 'Alpaca'
+.*(openorca-platypus2):
+  instruction_template: 'OpenOrca-Platypus2'
+  custom_stopping_strings: '"### Instruction:", "### Response:"'
 .*longchat:
   instruction_template: 'Vicuna-v1.1'
 .*vicuna-33b:


@@ -62,7 +62,7 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc
         from safetensors.torch import load_file as safe_load
         model.load_state_dict(safe_load(checkpoint), strict=False)
     else:
-        model.load_state_dict(torch.load(checkpoint), strict=False)
+        model.load_state_dict(torch.load(checkpoint, weights_only=True), strict=False)
     model.seqlen = 2048
     return model
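As an aside, the `weights_only=True` switch used above restricts `torch.load` to plain tensor data and primitive containers, so a checkpoint cannot run arbitrary pickled code on load. A small self-contained sketch of the behaviour (file name is illustrative only):

```python
# Sketch: torch.load(..., weights_only=True) unpickles tensors and basic containers
# only; a file containing arbitrary pickled objects raises instead of executing them.
import torch

state_dict = {'weight': torch.randn(4, 4), 'bias': torch.zeros(4)}
torch.save(state_dict, 'checkpoint.pt')

loaded = torch.load('checkpoint.pt', weights_only=True)
print(loaded['weight'].shape)  # torch.Size([4, 4])
```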


@@ -6,6 +6,7 @@ import torch
 from exllamav2 import (
     ExLlamaV2,
     ExLlamaV2Cache,
+    ExLlamaV2Cache_8bit,
     ExLlamaV2Config,
     ExLlamaV2Tokenizer
 )
@@ -46,6 +47,7 @@ class Exllamav2Model:
         config.max_seq_len = shared.args.max_seq_len
         config.scale_pos_emb = shared.args.compress_pos_emb
         config.scale_alpha_value = shared.args.alpha_value
+        config.no_flash_attn = shared.args.no_flash_attn
         model = ExLlamaV2(config)
@@ -56,7 +58,11 @@ class Exllamav2Model:
         model.load(split)
         tokenizer = ExLlamaV2Tokenizer(config)
-        cache = ExLlamaV2Cache(model)
+        if shared.args.cache_8bit:
+            cache = ExLlamaV2Cache_8bit(model)
+        else:
+            cache = ExLlamaV2Cache(model)
         generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
         result = self()
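For context, a rough sketch of the loading path above with the two new options hard-coded, written against exllamav2 0.0.7. The model directory is a placeholder and the attribute names are taken from what the diff sets; treat this as a sketch rather than canonical exllamav2 usage.

```python
# Sketch only: mirrors the loader above with --cache_8bit and --no_flash_attn applied.
# "/path/to/model" is a placeholder for a local ExLlamaV2 model directory.
from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Cache_8bit,
    ExLlamaV2Config,
    ExLlamaV2Tokenizer
)
from exllamav2.generator import ExLlamaV2BaseGenerator

config = ExLlamaV2Config()
config.model_dir = "/path/to/model"
config.prepare()
config.max_seq_len = 4096
config.no_flash_attn = True            # equivalent of --no_flash_attn

model = ExLlamaV2(config)
model.load()                           # or model.load([20, 7]) for a manual GPU split

tokenizer = ExLlamaV2Tokenizer(config)
cache = ExLlamaV2Cache_8bit(model)     # equivalent of --cache_8bit; ExLlamaV2Cache(model) otherwise
generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
```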


@@ -4,7 +4,12 @@ from pathlib import Path
 from typing import Any, Dict, Optional, Union
 import torch
-from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config
+from exllamav2 import (
+    ExLlamaV2,
+    ExLlamaV2Cache,
+    ExLlamaV2Cache_8bit,
+    ExLlamaV2Config
+)
 from torch.nn import CrossEntropyLoss
 from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithPast
@@ -40,11 +45,18 @@ class Exllamav2HF(PreTrainedModel):
         self.generation_config = GenerationConfig()
         self.loras = None
-        self.ex_cache = ExLlamaV2Cache(self.ex_model)
+        if shared.args.cache_8bit:
+            self.ex_cache = ExLlamaV2Cache_8bit(self.ex_model)
+        else:
+            self.ex_cache = ExLlamaV2Cache(self.ex_model)
         self.past_seq = None
         if shared.args.cfg_cache:
-            self.ex_cache_negative = ExLlamaV2Cache(self.ex_model)
+            if shared.args.cache_8bit:
+                self.ex_cache_negative = ExLlamaV2Cache_8bit(self.ex_model)
+            else:
+                self.ex_cache_negative = ExLlamaV2Cache(self.ex_model)
             self.past_seq_negative = None
     def _validate_model_class(self):
@@ -152,5 +164,6 @@ class Exllamav2HF(PreTrainedModel):
         config.max_seq_len = shared.args.max_seq_len
         config.scale_pos_emb = shared.args.compress_pos_emb
         config.scale_alpha_value = shared.args.alpha_value
+        config.no_flash_attn = shared.args.no_flash_attn
         return Exllamav2HF(config)


@@ -9,7 +9,6 @@ loaders_and_params = OrderedDict({
     'Transformers': [
         'cpu_memory',
         'gpu_memory',
-        'trust_remote_code',
         'load_in_8bit',
         'bf16',
         'cpu',
@@ -21,6 +20,7 @@
         'compute_dtype',
         'trust_remote_code',
         'use_fast',
+        'use_flash_attention_2',
         'alpha_value',
         'rope_freq_base',
         'compress_pos_emb',
@@ -41,6 +41,8 @@
         'gpu_split',
         'max_seq_len',
         'cfg_cache',
+        'no_flash_attn',
+        'cache_8bit',
         'alpha_value',
         'compress_pos_emb',
         'use_fast',
@@ -56,6 +58,8 @@
     'ExLlamav2': [
         'gpu_split',
         'max_seq_len',
+        'no_flash_attn',
+        'cache_8bit',
         'alpha_value',
         'compress_pos_emb',
     ],
@@ -144,7 +148,9 @@
 loaders_samplers = {
     'Transformers': {
         'temperature',
+        'temperature_last',
         'top_p',
+        'min_p',
         'top_k',
         'typical_p',
         'epsilon_cutoff',
@@ -179,7 +185,9 @@
     },
     'ExLlama_HF': {
         'temperature',
+        'temperature_last',
         'top_p',
+        'min_p',
         'top_k',
         'typical_p',
         'epsilon_cutoff',
@@ -239,7 +247,9 @@
     },
     'ExLlamav2_HF': {
         'temperature',
+        'temperature_last',
         'top_p',
+        'min_p',
         'top_k',
         'typical_p',
         'epsilon_cutoff',
@@ -270,7 +280,9 @@
     },
     'AutoGPTQ': {
         'temperature',
+        'temperature_last',
         'top_p',
+        'min_p',
         'top_k',
         'typical_p',
         'epsilon_cutoff',
@@ -305,7 +317,9 @@
     },
     'GPTQ-for-LLaMa': {
         'temperature',
+        'temperature_last',
         'top_p',
+        'min_p',
         'top_k',
         'typical_p',
         'epsilon_cutoff',
@@ -356,7 +370,9 @@
     },
     'llamacpp_HF': {
         'temperature',
+        'temperature_last',
         'top_p',
+        'min_p',
         'top_k',
         'typical_p',
         'epsilon_cutoff',
@@ -394,7 +410,9 @@
     },
     'AutoAWQ': {
         'temperature',
+        'temperature_last',
         'top_p',
+        'min_p',
         'top_k',
         'typical_p',
         'epsilon_cutoff',


@@ -123,8 +123,13 @@ def huggingface_loader(model_name):
     params = {
         'low_cpu_mem_usage': True,
         'trust_remote_code': shared.args.trust_remote_code,
-        'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16
+        'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
+        'use_safetensors': True if shared.args.force_safetensors else None
     }
+    if shared.args.use_flash_attention_2:
+        params['use_flash_attention_2'] = True
     config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=params['trust_remote_code'])
     if 'chatglm' in model_name.lower():
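For reference, a hedged sketch of what these two params translate to in a plain transformers call. The model id is a placeholder; at the transformers 4.35 pin used in this commit, `from_pretrained` still accepts `use_flash_attention_2` directly (later releases prefer `attn_implementation`).

```python
# Sketch: loading a model the way huggingface_loader() does when
# --use_flash_attention_2 and --force-safetensors are enabled.
# "meta-llama/Llama-2-7b-hf" is only a placeholder model id.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,      # flash-attention needs fp16/bf16 weights
    use_safetensors=True,           # refuse pickled .bin weights
    use_flash_attention_2=True,     # errors out if flash-attn is not installed
)
```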


@@ -8,7 +8,9 @@ def default_preset():
     return {
         'do_sample': True,
         'temperature': 1,
+        'temperature_last': False,
         'top_p': 1,
+        'min_p': 0,
         'top_k': 0,
         'typical_p': 1,
         'epsilon_cutoff': 0,


@@ -13,6 +13,36 @@ from transformers.generation.logits_process import (
 global_scores = None
+class MinPLogitsWarper(LogitsWarper):
+    def __init__(self, min_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
+        if min_p < 0 or min_p > 1.0:
+            raise ValueError(f"`min_p` has to be a float >= 0 and <= 1, but is {min_p}")
+        self.min_p = min_p
+        self.filter_value = filter_value
+        self.min_tokens_to_keep = min_tokens_to_keep
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        # Convert logits to probabilities
+        probs = torch.softmax(scores, dim=-1)
+        # Get the probability of the top token for each sequence in the batch
+        top_probs, _ = probs.max(dim=-1, keepdim=True)
+        # Calculate the actual min_p threshold by scaling min_p with the top token's probability
+        scaled_min_p = self.min_p * top_probs
+        # Create a mask for tokens that have a probability less than the scaled min_p
+        tokens_to_remove = probs < scaled_min_p
+        sorted_indices = torch.argsort(scores, descending=True, dim=-1)
+        sorted_indices_to_remove = torch.gather(tokens_to_remove, dim=-1, index=sorted_indices)
+        if self.min_tokens_to_keep > 1:
+            # Keep at least min_tokens_to_keep
+            sorted_indices_to_remove[..., : self.min_tokens_to_keep] = False
+        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+        scores = scores.masked_fill(indices_to_remove, self.filter_value)
+        return scores
 class TailFreeLogitsWarper(LogitsWarper):
     def __init__(self, tfs: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
         tfs = float(tfs)
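The effect of the new warper can be checked in isolation. A small, self-contained sketch (toy logits and plain torch, not tied to the class above) that reproduces the min_p rule:

```python
# Sketch: min_p keeps only tokens whose probability is at least
# min_p * P(top token). Here the top probability is ~0.65, so with
# min_p = 0.2 everything below ~0.13 is filtered out.
import torch

logits = torch.tensor([[4.0, 3.0, 2.0, 0.0, -1.0]])
probs = torch.softmax(logits, dim=-1)          # ~[0.65, 0.24, 0.09, 0.01, 0.004]

min_p = 0.2
threshold = min_p * probs.max(dim=-1, keepdim=True).values
filtered = logits.masked_fill(probs < threshold, -float("inf"))

print(probs)
print(filtered)                                # only the first two tokens survive
```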
@@ -186,17 +216,36 @@ def get_logits_warper_patch(self, generation_config):
             if not isinstance(warper, TemperatureLogitsWarper):
                 warpers.remove(warper)
     else:
-        if generation_config.tfs is not None and 0.0 <= generation_config.tfs <= 1.0:
+        if generation_config.tfs is not None and 0.0 <= generation_config.tfs < 1.0:
             warpers_to_add.append(TailFreeLogitsWarper(tfs=generation_config.tfs, min_tokens_to_keep=min_tokens_to_keep))
-        if generation_config.top_a is not None and 0.0 <= generation_config.top_a <= 1.0:
+        if generation_config.top_a is not None and 0.0 < generation_config.top_a <= 1.0:
             warpers_to_add.append(TopALogitsWarper(top_a=generation_config.top_a, min_tokens_to_keep=min_tokens_to_keep))
+        if generation_config.min_p is not None and 0.0 < generation_config.min_p <= 1.0:
+            warpers_to_add.append(MinPLogitsWarper(min_p=generation_config.min_p, min_tokens_to_keep=min_tokens_to_keep))
-    if warpers and isinstance(warpers[-1], LogitNormalization):
-        warpers = warpers[:-1] + warpers_to_add + [warpers[-1]]
+    if len(warpers) > 0 and isinstance(warpers[-1], LogitNormalization):
+        normalize = warpers.pop(-1)
     else:
-        warpers += warpers_to_add
+        normalize = None
+    warpers += warpers_to_add
+    if generation_config.temperature_last:
+        temperature_idx = None
+        for i in range(len(warpers)):
+            if warpers[i].__class__.__name__ == 'TemperatureLogitsWarper':
+                temperature_idx = i
+                break
+        if temperature_idx is not None:
+            warpers = warpers[:temperature_idx] + warpers[temperature_idx + 1:] + [warpers[temperature_idx]]
+    warpers = LogitsProcessorList(warpers)
+    if normalize is not None:
+        warpers.append(normalize)
     warpers.append(SpyLogitsWarper())
+    # for i in range(len(warpers)):
+    #     print(warpers[i].__class__.__name__)
     return warpers
@@ -222,6 +271,7 @@ def get_logits_processor_patch(self, **kwargs):
 def generation_config_init_patch(self, **kwargs):
     self.__init___old(**kwargs)
+    self.min_p = kwargs.pop("min_p", 0.0)
     self.tfs = kwargs.pop("tfs", 1.0)
     self.top_a = kwargs.pop("top_a", 0.0)
     self.mirostat_mode = kwargs.pop("mirostat_mode", 0)
@@ -230,6 +280,7 @@ def generation_config_init_patch(self, **kwargs):
     self.repetition_penalty_range = kwargs.pop("repetition_penalty_range", 0)
     self.presence_penalty = kwargs.pop("presence_penalty", 0)
     self.frequency_penalty = kwargs.pop("frequency_penalty", 0)
+    self.temperature_last = kwargs.pop("temperature_last", False)
 def hijack_samplers():
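To see why the reordering matters, here is a small sketch contrasting temperature-first with the new temperature-last behaviour, using toy logits and stock transformers warpers; the numbers are purely illustrative.

```python
# Sketch: with temperature applied first, a high temperature flattens the
# distribution before top_p filtering, so more tokens survive; applied last,
# top_p filters on the unscaled distribution and temperature only rescales
# what is left.
import torch
from transformers import LogitsProcessorList, TemperatureLogitsWarper, TopPLogitsWarper

input_ids = torch.tensor([[0]])                      # dummy, unused by these warpers
logits = torch.tensor([[5.0, 3.0, 1.0, 0.0, -1.0]])

temperature_first = LogitsProcessorList([TemperatureLogitsWarper(2.0), TopPLogitsWarper(0.9)])
temperature_last = LogitsProcessorList([TopPLogitsWarper(0.9), TemperatureLogitsWarper(2.0)])

surviving_first = (temperature_first(input_ids, logits.clone()) > -float("inf")).sum().item()
surviving_last = (temperature_last(input_ids, logits.clone()) > -float("inf")).sum().item()
print(surviving_first, surviving_last)               # temperature-first keeps more tokens
```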


@@ -91,7 +91,9 @@ parser.add_argument('--no-cache', action='store_true', help='Set use_cache to Fa
 parser.add_argument('--xformers', action='store_true', help='Use xformer\'s memory efficient attention. This is really old and probably doesn\'t do anything.')
 parser.add_argument('--sdp-attention', action='store_true', help='Use PyTorch 2.0\'s SDP attention. Same as above.')
 parser.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
+parser.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
 parser.add_argument('--use_fast', action='store_true', help='Set use_fast=True while loading the tokenizer.')
+parser.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.')
 # Accelerate 4-bit
 parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).')
@@ -117,6 +119,8 @@ parser.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (
 parser.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
 parser.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequence length.')
 parser.add_argument('--cfg-cache', action='store_true', help='ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama.')
+parser.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
+parser.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.')
 # AutoGPTQ
 parser.add_argument('--triton', action='store_true', help='Use triton.')


@@ -274,7 +274,7 @@ def apply_stopping_strings(reply, all_stop_strings):
 def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False):
     generate_params = {}
-    for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'tfs', 'top_a', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'guidance_scale']:
+    for k in ['max_new_tokens', 'do_sample', 'temperature', 'temperature_last', 'top_p', 'min_p', 'typical_p', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'tfs', 'top_a', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'guidance_scale']:
         generate_params[k] = state[k]
     if state['negative_prompt'] != '':


@@ -544,7 +544,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
         lora_model = get_peft_model(shared.model, config)
         if not always_override and Path(f"{lora_file_path}/adapter_model.bin").is_file():
             logger.info("Loading existing LoRA data...")
-            state_dict_peft = torch.load(f"{lora_file_path}/adapter_model.bin")
+            state_dict_peft = torch.load(f"{lora_file_path}/adapter_model.bin", weights_only=True)
             set_peft_model_state_dict(lora_model, state_dict_peft)
     except:
         yield traceback.format_exc().replace('\n', '\n\n')


@@ -53,6 +53,7 @@ def list_model_elements():
         'load_in_8bit',
         'trust_remote_code',
         'use_fast',
+        'use_flash_attention_2',
         'load_in_4bit',
         'compute_dtype',
         'quant_type',
@@ -68,6 +69,8 @@ def list_model_elements():
         'no_use_cuda_fp16',
         'disable_exllama',
         'cfg_cache',
+        'no_flash_attn',
+        'cache_8bit',
         'threads',
         'threads_batch',
         'n_batch',
@@ -102,7 +105,9 @@ def list_interface_input_elements():
         'max_tokens_second',
         'seed',
         'temperature',
+        'temperature_last',
         'top_p',
+        'min_p',
         'top_k',
         'typical_p',
         'epsilon_cutoff',


@@ -86,7 +86,7 @@ def create_ui():
             shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type)
             shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=128, value=shared.args.n_gpu_layers)
-            shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=32768, step=256, label="n_ctx", value=shared.args.n_ctx)
+            shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=shared.settings['truncation_length_max'], step=256, label="n_ctx", value=shared.args.n_ctx)
             shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads)
             shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=32, value=shared.args.threads_batch)
             shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch)
@@ -97,7 +97,7 @@ def create_ui():
             shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0)
             shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.')
             shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
-            shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=32768, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len)
+            shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Maximum sequence length.', value=shared.args.max_seq_len)
             shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.05, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
             shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=1000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base)
             shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb)
@@ -124,9 +124,12 @@ def create_ui():
             shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
             shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code)
             shared.gradio['use_fast'] = gr.Checkbox(label="use_fast", value=shared.args.use_fast, info='Set use_fast=True while loading the tokenizer. May trigger a conversion that takes several minutes.')
+            shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
             shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.')
+            shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.')
+            shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
             shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
-            shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).')
+            shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/wiki/04-%E2%80%90-Model-Tab#exllama_hf).')
             shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.')
             shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1: download `oobabooga/llama-tokenizer` under "Download model or LoRA". That\'s a default Llama tokenizer.\n\nOption 2: place your .gguf in a subfolder of models/ along with these 3 files: tokenizer.model, tokenizer_config.json, and special_tokens_map.json. This takes precedence over Option 1.')
@@ -212,6 +215,9 @@ def load_model_wrapper(selected_model, loader, autoload=False):
         if 'instruction_template' in settings:
             output += '\n\nIt seems to be an instruction-following model with template "{}". In the chat tab, instruct or chat-instruct modes should be used.'.format(settings['instruction_template'])
+        # Applying the changes to the global shared settings (in-memory)
+        shared.settings.update({k: v for k, v in settings.items() if k in shared.settings})
         yield output
     else:
         yield f"Failed to load `{selected_model}`."


@@ -29,9 +29,10 @@ def create_ui(default_preset):
             shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens'])
             shared.gradio['temperature'] = gr.Slider(0.01, 1.99, value=generate_params['temperature'], step=0.01, label='temperature')
             shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p')
+            shared.gradio['min_p'] = gr.Slider(0.0, 1.0, value=generate_params['min_p'], step=0.01, label='min_p')
             shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k')
             shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty')
-            shared.gradio['presence_penalty'] = gr.Slider(0, 4, value=generate_params['presence_penalty'], step=0.05, label='presence_penalty')
+            shared.gradio['presence_penalty'] = gr.Slider(0, 2, value=generate_params['presence_penalty'], step=0.05, label='presence_penalty')
             shared.gradio['frequency_penalty'] = gr.Slider(0, 2, value=generate_params['frequency_penalty'], step=0.05, label='frequency_penalty')
             shared.gradio['repetition_penalty_range'] = gr.Slider(0, 4096, step=64, value=generate_params['repetition_penalty_range'], label='repetition_penalty_range')
             shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p')
@@ -47,6 +48,7 @@ def create_ui(default_preset):
             shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.')
             shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau')
             shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta')
+            shared.gradio['temperature_last'] = gr.Checkbox(value=generate_params['temperature_last'], label='temperature_last', info='Makes temperature the last sampler instead of the first.')
             shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample')
             shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)')
         with gr.Accordion('Other parameters', open=False):
@@ -57,7 +59,7 @@ def create_ui(default_preset):
             shared.gradio['length_penalty'] = gr.Slider(-5, 5, value=generate_params['length_penalty'], label='length_penalty')
             shared.gradio['early_stopping'] = gr.Checkbox(value=generate_params['early_stopping'], label='early_stopping')
-        gr.Markdown("[Learn more](https://github.com/oobabooga/text-generation-webui/blob/main/docs/Generation-Parameters.md)")
+        gr.Markdown("[Learn more](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab)")
     with gr.Column():
         with gr.Row():


@@ -2,7 +2,7 @@ accelerate==0.24.*
 colorama
 datasets
 einops
-exllamav2==0.0.6; platform_system != "Darwin" and platform_machine != "x86_64"
+exllamav2==0.0.7; platform_system != "Darwin" and platform_machine != "x86_64"
 gradio==3.50.*
 markdown
 numpy==1.24.*
@@ -16,7 +16,7 @@ safetensors==0.4.0
 scipy
 sentencepiece
 tensorboard
-transformers==4.34.*
+transformers==4.35.*
 tqdm
 wandb
@@ -53,14 +53,14 @@ https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121
 https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
 https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
 https://github.com/bdashore3/flash-attention/releases/download/2.3.2-2/flash_attn-2.3.2+cu122-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/bdashore3/flash-attention/releases/download/2.3.2-2/flash_attn-2.3.2+cu122-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
@@ -84,4 +84,4 @@ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
 https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu121-py3-none-any.whl
-autoawq==0.1.4; platform_system == "Linux" or platform_system == "Windows"
+autoawq==0.1.6; platform_system == "Linux" or platform_system == "Windows"


@@ -2,7 +2,7 @@ accelerate==0.24.*
 colorama
 datasets
 einops
-exllamav2==0.0.6
+exllamav2==0.0.7
 gradio==3.50.*
 markdown
 numpy==1.24.*
@@ -16,7 +16,7 @@ safetensors==0.4.0
 scipy
 sentencepiece
 tensorboard
-transformers==4.34.*
+transformers==4.35.*
 tqdm
 wandb


@@ -2,7 +2,7 @@ accelerate==0.24.*
 colorama
 datasets
 einops
-exllamav2==0.0.6
+exllamav2==0.0.7
 gradio==3.50.*
 markdown
 numpy==1.24.*
@@ -16,7 +16,7 @@ safetensors==0.4.0
 scipy
 sentencepiece
 tensorboard
-transformers==4.34.*
+transformers==4.35.*
 tqdm
 wandb


@@ -2,7 +2,7 @@ accelerate==0.24.*
 colorama
 datasets
 einops
-exllamav2==0.0.6
+exllamav2==0.0.7
 gradio==3.50.*
 markdown
 numpy==1.24.*
@@ -16,7 +16,7 @@ safetensors==0.4.0
 scipy
 sentencepiece
 tensorboard
-transformers==4.34.*
+transformers==4.35.*
 tqdm
 wandb


@@ -2,7 +2,7 @@ accelerate==0.24.*
 colorama
 datasets
 einops
-exllamav2==0.0.6
+exllamav2==0.0.7
 gradio==3.50.*
 markdown
 numpy==1.24.*
@@ -16,7 +16,7 @@ safetensors==0.4.0
 scipy
 sentencepiece
 tensorboard
-transformers==4.34.*
+transformers==4.35.*
 tqdm
 wandb


@@ -2,7 +2,7 @@ accelerate==0.24.*
 colorama
 datasets
 einops
-exllamav2==0.0.6
+exllamav2==0.0.7
 gradio==3.50.*
 markdown
 numpy==1.24.*
@@ -16,7 +16,7 @@ safetensors==0.4.0
 scipy
 sentencepiece
 tensorboard
-transformers==4.34.*
+transformers==4.35.*
 tqdm
 wandb


@@ -2,7 +2,7 @@ accelerate==0.24.*
 colorama
 datasets
 einops
-exllamav2==0.0.6
+exllamav2==0.0.7
 gradio==3.50.*
 markdown
 numpy==1.24.*
@@ -16,7 +16,7 @@ safetensors==0.4.0
 scipy
 sentencepiece
 tensorboard
-transformers==4.34.*
+transformers==4.35.*
 tqdm
 wandb


@@ -2,7 +2,7 @@ accelerate==0.24.*
 colorama
 datasets
 einops
-exllamav2==0.0.6; platform_system != "Darwin" and platform_machine != "x86_64"
+exllamav2==0.0.7; platform_system != "Darwin" and platform_machine != "x86_64"
 gradio==3.50.*
 markdown
 numpy==1.24.*
@@ -16,7 +16,7 @@ safetensors==0.4.0
 scipy
 sentencepiece
 tensorboard
-transformers==4.34.*
+transformers==4.35.*
 tqdm
 wandb
@@ -53,14 +53,14 @@ https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121
 https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
 https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.6/exllamav2-0.0.6+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.7/exllamav2-0.0.7+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
 https://github.com/bdashore3/flash-attention/releases/download/2.3.2-2/flash_attn-2.3.2+cu122-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/bdashore3/flash-attention/releases/download/2.3.2-2/flash_attn-2.3.2+cu122-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
@@ -84,4 +84,4 @@ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
 https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX/ctransformers-0.2.27+cu121-py3-none-any.whl
-autoawq==0.1.4; platform_system == "Linux" or platform_system == "Windows"
+autoawq==0.1.6; platform_system == "Linux" or platform_system == "Windows"


@@ -2,7 +2,7 @@ accelerate==0.24.*
 colorama
 datasets
 einops
-exllamav2==0.0.6
+exllamav2==0.0.7
 gradio==3.50.*
 markdown
 numpy==1.24.*
@@ -16,7 +16,7 @@ safetensors==0.4.0
 scipy
 sentencepiece
 tensorboard
-transformers==4.34.*
+transformers==4.35.*
 tqdm
 wandb