Merge branch 'oobabooga:main' into exllama-module

commit 6254203f84
Author: jllllll
Date:   2023-06-21 14:15:08 -05:00 (committed by GitHub)
17 changed files with 225 additions and 109 deletions


@@ -212,7 +212,7 @@ Optionally, you can use the following command-line flags:
 | Flag | Description |
 |--------------------------------------------|-------------|
-| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, llamacpp, rwkv, flexgen |
+| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen |
 #### Accelerate/transformers


@@ -0,0 +1,4 @@
+user: "<|user|>"
+bot: "<|assistant|>"
+context: "<|system|>\n<|end|>\n"
+turn_template: "<|user|>\n<|user-message|><|end|>\n<|bot|>\n<|bot-message|><|end|>\n"


@@ -0,0 +1,4 @@
+user: "<|user|>"
+bot: "<|assistant|>"
+context: ""
+turn_template: "<|user|>\n<|user-message|>\n<|bot|>\n<|bot-message|>\n"
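
The two new instruction templates above follow the web UI's turn-template convention: `<|user|>` and `<|bot|>` expand to the `user` and `bot` strings, `<|user-message|>` and `<|bot-message|>` are replaced with the actual turn text, and `context` is prepended once at the top of the prompt. A minimal sketch of that expansion, using an illustrative helper rather than the project's own prompt-building code:

```python
# Illustrative sketch of how a turn_template entry expands into a prompt.
# The helper below is hypothetical; it is not the webui's actual implementation.
template = {
    "user": "<|user|>",
    "bot": "<|assistant|>",
    "context": "",
    "turn_template": "<|user|>\n<|user-message|>\n<|bot|>\n<|bot-message|>\n",
}

def render_turn(user_message: str, bot_message: str = "") -> str:
    turn = template["turn_template"]
    turn = turn.replace("<|user|>", template["user"])
    turn = turn.replace("<|bot|>", template["bot"])
    turn = turn.replace("<|user-message|>", user_message)
    turn = turn.replace("<|bot-message|>", bot_message)
    return turn

# The context string comes first, then one rendered block per conversation turn.
prompt = template["context"] + render_turn("How do I list files in a directory?")
print(prompt)
# <|user|>
# How do I list files in a directory?
# <|assistant|>
```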


@@ -93,3 +93,22 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 .message-body :not(pre) > code {
   white-space: normal !important;
 }
+
+@media print {
+  body {
+    visibility: hidden;
+  }
+
+  .chat {
+    visibility: visible;
+    position: absolute;
+    left: 0;
+    top: 0;
+    max-width: none;
+    max-height: none;
+    width: 100%;
+    height: fit-content;
+    display: flex;
+    flex-direction: column-reverse;
+  }
+}


@@ -17,6 +17,10 @@
   margin-bottom: 1.25em !important;
 }
 
+.message-body ul, .message-body ol {
+  margin-bottom: 1.25em !important;
+}
+
 .dark .message-body p em {
   color: rgb(198, 202, 214) !important;
 }


@@ -26,7 +26,7 @@ LABEL maintainer="Your Name <your.email@example.com>"
 LABEL description="Docker image for GPTQ-for-LLaMa and Text Generation WebUI"
 
 RUN apt-get update && \
-    apt-get install --no-install-recommends -y libportaudio2 libasound-dev git python3 python3-pip make g++ && \
+    apt-get install --no-install-recommends -y python3-dev libportaudio2 libasound-dev git python3 python3-pip make g++ && \
     rm -rf /var/lib/apt/lists/*
 
 RUN --mount=type=cache,target=/root/.cache/pip pip3 install virtualenv


@@ -1,5 +1,5 @@
 '''
-Downloads models from Hugging Face to models/model-name.
+Downloads models from Hugging Face to models/username_modelname.
 
 Example:
 python download-model.py facebook/opt-1.3b
@@ -11,8 +11,8 @@ import base64
 import datetime
 import hashlib
 import json
-import re
 import os
+import re
 import sys
 from pathlib import Path
@@ -21,63 +21,12 @@ import tqdm
 from tqdm.contrib.concurrent import thread_map
 
-def select_model_from_default_options():
-    models = {
-        "OPT 6.7B": ("facebook", "opt-6.7b", "main"),
-        "OPT 2.7B": ("facebook", "opt-2.7b", "main"),
-        "OPT 1.3B": ("facebook", "opt-1.3b", "main"),
-        "OPT 350M": ("facebook", "opt-350m", "main"),
-        "GALACTICA 6.7B": ("facebook", "galactica-6.7b", "main"),
-        "GALACTICA 1.3B": ("facebook", "galactica-1.3b", "main"),
-        "GALACTICA 125M": ("facebook", "galactica-125m", "main"),
-        "Pythia-6.9B-deduped": ("EleutherAI", "pythia-6.9b-deduped", "main"),
-        "Pythia-2.8B-deduped": ("EleutherAI", "pythia-2.8b-deduped", "main"),
-        "Pythia-1.4B-deduped": ("EleutherAI", "pythia-1.4b-deduped", "main"),
-        "Pythia-410M-deduped": ("EleutherAI", "pythia-410m-deduped", "main"),
-    }
-
-    choices = {}
-    print("Select the model that you want to download:\n")
-    for i, name in enumerate(models):
-        char = chr(ord('A') + i)
-        choices[char] = name
-        print(f"{char}) {name}")
-
-    char_hugging = chr(ord('A') + len(models))
-    print(f"{char_hugging}) Manually specify a Hugging Face model")
-    char_exit = chr(ord('A') + len(models) + 1)
-    print(f"{char_exit}) Do not download a model")
-
-    print()
-    print("Input> ", end='')
-    choice = input()[0].strip().upper()
-    if choice == char_exit:
-        exit()
-    elif choice == char_hugging:
-        print("""\nType the name of your desired Hugging Face model in the format organization/name.
-
-Examples:
-facebook/opt-1.3b
-EleutherAI/pythia-1.4b-deduped
-""")
-        print("Input> ", end='')
-        model = input()
-        branch = "main"
-    else:
-        arr = models[choices[choice]]
-        model = f"{arr[0]}/{arr[1]}"
-        branch = arr[2]
-
-    return model, branch
-
 class ModelDownloader:
     def __init__(self):
         self.s = requests.Session()
         if os.getenv('HF_USER') is not None and os.getenv('HF_PASS') is not None:
             self.s.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS'))
 
     def sanitize_model_and_branch_names(self, model, branch):
         if model[-1] == '/':
             model = model[:-1]
@@ -92,7 +41,6 @@ class ModelDownloader:
         return model, branch
 
     def get_download_links_from_huggingface(self, model, branch, text_only=False):
         base = "https://huggingface.co"
         page = f"/api/models/{model}/tree/{branch}"
@@ -163,7 +111,6 @@ class ModelDownloader:
         return links, sha256, is_lora
 
     def get_output_folder(self, model, branch, is_lora, base_folder=None):
         if base_folder is None:
             base_folder = 'models' if not is_lora else 'loras'
@@ -171,59 +118,64 @@ class ModelDownloader:
         output_folder = f"{'_'.join(model.split('/')[-2:])}"
         if branch != 'main':
             output_folder += f'_{branch}'
 
         output_folder = Path(base_folder) / output_folder
         return output_folder
 
     def get_single_file(self, url, output_folder, start_from_scratch=False):
         filename = Path(url.rsplit('/', 1)[1])
         output_path = output_folder / filename
+        headers = {}
+        mode = 'wb'
         if output_path.exists() and not start_from_scratch:
             # Check if the file has already been downloaded completely
             r = self.s.get(url, stream=True, timeout=20)
             total_size = int(r.headers.get('content-length', 0))
             if output_path.stat().st_size >= total_size:
                 return
             # Otherwise, resume the download from where it left off
             headers = {'Range': f'bytes={output_path.stat().st_size}-'}
             mode = 'ab'
-        else:
-            headers = {}
-            mode = 'wb'
 
-        r = self.s.get(url, stream=True, headers=headers, timeout=20)
-        with open(output_path, mode) as f:
-            total_size = int(r.headers.get('content-length', 0))
-            block_size = 1024
-            with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t:
-                for data in r.iter_content(block_size):
-                    t.update(len(data))
-                    f.write(data)
+        with self.s.get(url, stream=True, headers=headers, timeout=20) as r:
+            r.raise_for_status()  # Do not continue the download if the request was unsuccessful
+            total_size = int(r.headers.get('content-length', 0))
+            block_size = 1024 * 1024  # 1MB
+            with open(output_path, mode) as f:
+                with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t:
+                    count = 0
+                    for data in r.iter_content(block_size):
+                        t.update(len(data))
+                        f.write(data)
+                        if total_size != 0 and self.progress_bar is not None:
+                            count += len(data)
+                            self.progress_bar(float(count) / float(total_size), f"Downloading {filename}")
 
     def start_download_threads(self, file_list, output_folder, start_from_scratch=False, threads=1):
         thread_map(lambda url: self.get_single_file(url, output_folder, start_from_scratch=start_from_scratch), file_list, max_workers=threads, disable=True)
 
-    def download_model_files(self, model, branch, links, sha256, output_folder, start_from_scratch=False, threads=1):
+    def download_model_files(self, model, branch, links, sha256, output_folder, progress_bar=None, start_from_scratch=False, threads=1):
+        self.progress_bar = progress_bar
+
         # Creating the folder and writing the metadata
-        if not output_folder.exists():
-            output_folder.mkdir(parents=True, exist_ok=True)
-            with open(output_folder / 'huggingface-metadata.txt', 'w') as f:
-                f.write(f'url: https://huggingface.co/{model}\n')
-                f.write(f'branch: {branch}\n')
-                f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n')
-                sha256_str = ''
-                for i in range(len(sha256)):
-                    sha256_str += f' {sha256[i][1]} {sha256[i][0]}\n'
-                if sha256_str != '':
-                    f.write(f'sha256sum:\n{sha256_str}')
+        output_folder.mkdir(parents=True, exist_ok=True)
+        metadata = f'url: https://huggingface.co/{model}\n' \
+                   f'branch: {branch}\n' \
+                   f'download date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
+
+        sha256_str = '\n'.join([f' {item[1]} {item[0]}' for item in sha256])
+        if sha256_str:
+            metadata += f'sha256sum:\n{sha256_str}'
+
+        metadata += '\n'
+        (output_folder / 'huggingface-metadata.txt').write_text(metadata)
 
         # Downloading the files
         print(f"Downloading the model to {output_folder}")
         self.start_download_threads(links, output_folder, start_from_scratch=start_from_scratch, threads=threads)
 
@@ -264,8 +216,6 @@ if __name__ == '__main__':
     branch = args.branch
     model = args.MODEL
-    if model is None:
-        model, branch = select_model_from_default_options()
 
     downloader = ModelDownloader()
 
     # Cleaning up the model/branch names
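
The reworked `download_model_files` above accepts an optional `progress_bar` callback, which `get_single_file` invokes with a completion fraction and a status string. A hedged sketch of driving the downloader outside the web UI with a plain print callback (the web UI itself passes a `gr.Progress` object instead; the model name is a placeholder):

```python
# Illustrative only: any callable accepting (fraction, description) works as progress_bar.
import importlib

downloader_module = importlib.import_module("download-model")  # the script's module name contains a dash
downloader = downloader_module.ModelDownloader()

def print_progress(fraction, description):
    print(f"{description}: {fraction * 100:.1f}%")

model, branch = downloader.sanitize_model_and_branch_names("facebook/opt-1.3b", "main")
links, sha256, is_lora = downloader.get_download_links_from_huggingface(model, branch)
output_folder = downloader.get_output_folder(model, branch, is_lora)
downloader.download_model_files(model, branch, links, sha256, output_folder,
                                progress_bar=print_progress, threads=1)
```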


@@ -50,7 +50,7 @@ llama-65b-gptq-3bit:
 .*vicuna.*v0:
     mode: 'instruct'
     instruction_template: 'Vicuna-v0'
-.*vicuna.*(1.1|1_1):
+.*vicuna.*(1.1|1_1|1.3|1_3):
     mode: 'instruct'
     instruction_template: 'Vicuna-v1.1'
 .*wizard.*vicuna:
@@ -184,7 +184,7 @@ llama-65b-gptq-3bit:
 .*Nous-Hermes-13b:
     mode: 'instruct'
     instruction_template: 'Alpaca'
-.*airoboros-13b-gpt4:
+.*airoboros:
     mode: 'instruct'
     instruction_template: 'Vicuna-v1.1'
 .*WizardLM-30B-V1.0:
@@ -193,7 +193,7 @@ llama-65b-gptq-3bit:
 TheBloke_WizardLM-30B-GPTQ:
     mode: 'instruct'
     instruction_template: 'Vicuna-v1.1'
-.*(A|a)lpa(cino|sta):
+.*alpa(cino|sta):
     mode: 'instruct'
     instruction_template: 'Alpaca'
 .*hippogriff:
@@ -202,9 +202,33 @@ TheBloke_WizardLM-30B-GPTQ:
 .*gpt4all-.*-snoozy:
     mode: 'instruct'
     instruction_template: 'WizardLM'
-.*(L|l)azarus:
+.*lazarus:
     mode: 'instruct'
     instruction_template: 'Alpaca'
-.*(G|g)uanaco-.*(7|13|33|65)(b|B):
+.*guanaco-.*(7|13|33|65)b:
     mode: 'instruct'
     instruction_template: 'Guanaco'
+.*hypermantis:
+    mode: 'instruct'
+    instruction_template: 'Alpaca'
+.*open-llama-.*-open-instruct:
+    mode: 'instruct'
+    instruction_template: 'Alpaca'
+.*starcoder-gpteacher-code-instruct:
+    mode: 'instruct'
+    instruction_template: 'Alpaca'
+.*tulu:
+    mode: 'instruct'
+    instruction_template: 'Tulu'
+.*chronos:
+    mode: 'instruct'
+    instruction_template: 'Alpaca'
+.*samantha:
+    mode: 'instruct'
+    instruction_template: 'Samantha'
+.*wizardcoder:
+    mode: 'instruct'
+    instruction_template: 'Alpaca'
+.*starchat-beta:
+    mode: 'instruct'
+    instruction_template: 'Starchat-Beta'
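
The keys above are regular expressions matched against the model's folder name to infer a chat mode and instruction template; dropping alternations such as `(L|l)` and `(G|g)` suggests the lookup lowercases both the pattern and the name before matching. A rough sketch of such a lookup (the function and config path below are illustrative, not the web UI's exact implementation):

```python
import re

import yaml  # PyYAML assumed available

def infer_model_settings(model_name, config_path="models/config.yaml"):
    # Accumulate the settings of every pattern that matches the model name;
    # entries later in the file override earlier ones.
    with open(config_path) as f:
        config = yaml.safe_load(f)

    settings = {}
    for pattern, values in config.items():
        if re.match(pattern.lower(), model_name.lower()):
            settings.update(values)

    return settings

print(infer_model_settings("TheBloke_guanaco-33B-GPTQ"))
# Expected to include: {'mode': 'instruct', 'instruction_template': 'Guanaco', ...}
```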

modules/exllama_hf.py (new file, 82 lines)

@@ -0,0 +1,82 @@
+import os
+import sys
+from pathlib import Path
+from typing import *
+
+import torch
+from transformers import (
+    GenerationConfig,
+    LlamaTokenizer,
+    PretrainedConfig,
+    PreTrainedModel
+)
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+from modules import shared
+from modules.logging_colors import logger
+from modules.relative_imports import RelativeImport
+
+with RelativeImport("repositories/exllama"):
+    from model import ExLlama, ExLlamaCache, ExLlamaConfig
+
+
+class ExllamaHF(PreTrainedModel):
+    def __init__(self, config: ExLlamaConfig):
+        super().__init__(PretrainedConfig())
+        self.ex_config = config
+        self.ex_model = ExLlama(self.ex_config)
+        self.generation_config = GenerationConfig()
+
+    def _validate_model_class(self):
+        pass
+
+    def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
+        pass
+
+    def prepare_inputs_for_generation(self, input_ids, **kwargs):
+        return {'input_ids': input_ids, **kwargs}
+
+    @property
+    def device(self) -> torch.device:
+        # TODO: May cause problem on multi-gpu inference?
+        return torch.device(0)
+
+    def __call__(self, *args, **kwargs):
+        # TODO: Some decoding methods (such as Contrastive Search) may not work at this time
+        assert len(args) == 0, 'no *args should be passed to forward'
+        use_cache = kwargs['use_cache']
+        seq = kwargs['input_ids'][0].tolist()
+        cache = kwargs['past_key_values'] if 'past_key_values' in kwargs else None
+        if cache is None:
+            cache = ExLlamaCache(self.ex_model)
+            self.ex_model.forward(torch.tensor([seq[:-1]], dtype=torch.long), cache, preprocess_only=True)
+
+        logits = self.ex_model.forward(torch.tensor([seq[-1:]], dtype=torch.long), cache).to(self.device)
+
+        return CausalLMOutputWithPast(logits=logits, past_key_values=cache if use_cache else None)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
+        assert len(model_args) == 0 and len(kwargs) == 0, "extra args is currently not supported"
+        if isinstance(pretrained_model_name_or_path, str):
+            pretrained_model_name_or_path = Path(pretrained_model_name_or_path)
+
+        pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path)
+        config = ExLlamaConfig(pretrained_model_name_or_path / 'config.json')
+
+        # from 'oobabooga/text-generation-webui/modules/exllama.py'
+        weight_path = None
+        for ext in ['.safetensors', '.pt', '.bin']:
+            found = list(pretrained_model_name_or_path.glob(f"*{ext}"))
+            if len(found) > 0:
+                weight_path = found[-1]
+                break
+        assert weight_path is not None, f'could not find weight in "{pretrained_model_name_or_path}"'
+
+        config.model_path = str(weight_path)
+
+        # This slowes down a bit but align better with autogptq generation.
+        # TODO: Should give user choice to tune the exllama config
+        config.act_order = True
+        config.fused_attn = False
+        config.fused_mlp_thd = 0
+
+        return ExllamaHF(config)
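
Because `ExllamaHF` subclasses `PreTrainedModel` and returns a `CausalLMOutputWithPast`, the stock `transformers` `generate()` loop and its samplers can drive it. A hedged sketch of standalone usage, assuming the script runs from the web UI root with `repositories/exllama` checked out; the model folder name is a placeholder:

```python
# Illustrative sketch, not part of this commit.
from transformers import LlamaTokenizer

from modules.exllama_hf import ExllamaHF

model = ExllamaHF.from_pretrained("TheBloke_WizardLM-30B-GPTQ")  # placeholder folder under models/
tokenizer = LlamaTokenizer.from_pretrained("models/TheBloke_WizardLM-30B-GPTQ")

input_ids = tokenizer("Hello, my name is", return_tensors="pt").input_ids.cuda()
output_ids = model.generate(input_ids=input_ids, max_new_tokens=32, do_sample=True)
print(tokenizer.decode(output_ids[0]))
```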


@@ -52,9 +52,9 @@ class LlamaCppModel:
             'n_gpu_layers': shared.args.n_gpu_layers
         }
 
-        self.model = Llama(**params)
+        result.model = Llama(**params)
         if cache_capacity > 0:
-            self.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
+            result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
 
         # This is ugly, but the model and the tokenizer are the same object in this library.
         return result, result
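
The `self.model` to `result.model` change fixes an attribute-assignment bug: `from_pretrained` here apparently builds a fresh instance named `result` to return, so assigning through the class object stored the `Llama` object as a class attribute shared by every instance rather than on the instance being returned. A minimal, hypothetical reproduction of that pitfall (names do not correspond to the webui's code):

```python
# Hypothetical illustration of class-attribute vs instance-attribute assignment.
class Loader:
    @classmethod
    def from_pretrained(cls, value):
        result = cls()
        cls.model = value       # bug: sets a class attribute shared by all instances
        # result.model = value  # fix: attach the object to the instance being returned
        return result

a = Loader.from_pretrained("model-a")
b = Loader.from_pretrained("model-b")
print(a.model)  # prints "model-b" with the buggy line: the class attribute was overwritten
```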


@@ -55,6 +55,10 @@ loaders_and_params = {
     'ExLlama' : [
         'gpu_split',
         'exllama_info',
+    ],
+    'ExLlama_HF' : [
+        'gpu_split',
+        'exllama_HF_info',
     ]
 }


@@ -49,7 +49,8 @@ def load_model(model_name, loader=None):
         'llama.cpp': llamacpp_loader,
         'FlexGen': flexgen_loader,
         'RWKV': RWKV_loader,
-        'ExLlama': ExLlama_loader
+        'ExLlama': ExLlama_loader,
+        'ExLlama_HF': ExLlama_HF_loader
     }
 
     if loader is None:
@@ -278,6 +279,12 @@ def ExLlama_loader(model_name):
     return model, tokenizer
 
+def ExLlama_HF_loader(model_name):
+    from modules.exllama_hf import ExllamaHF
+
+    return ExllamaHF.from_pretrained(model_name)
+
 def get_max_memory_dict():
     max_memory = {}
     if shared.args.gpu_memory:


@@ -98,7 +98,7 @@ parser.add_argument('--extensions', type=str, nargs="+", help='The list of exten
 parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')
 
 # Model loader
-parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, llamacpp, rwkv, flexgen')
+parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen')
 
 # Accelerate/transformers
 parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
@@ -218,6 +218,8 @@ def fix_loader_name(name):
         return 'GPTQ-for-LLaMa'
     elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']:
         return 'ExLlama'
+    elif name in ['exllama-hf', 'exllama_hf', 'exllama hf', 'ex-llama-hf', 'ex_llama_hf']:
+        return 'ExLlama_HF'
 
 if args.loader is not None:
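
With the new aliases, any of the listed spellings normalize to `ExLlama_HF` before the rest of the code sees the loader name; a quick illustrative check (assuming it is run from the web UI root so `modules.shared` imports cleanly):

```python
from modules.shared import fix_loader_name

for name in ['exllama-hf', 'exllama_hf', 'exllama hf', 'ex-llama-hf', 'ex_llama_hf']:
    assert fix_loader_name(name) == 'ExLlama_HF'
```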


@@ -104,9 +104,8 @@ def get_reply_from_output_ids(output_ids, input_ids, original_question, state, i
     else:
         new_tokens = len(output_ids) - len(input_ids[0])
         reply = decode(output_ids[-new_tokens:], state['skip_special_tokens'])
-
         # Prevent LlamaTokenizer from skipping a space
-        if type(shared.tokenizer) is transformers.LlamaTokenizer and len(output_ids) > 0:
+        if type(shared.tokenizer) in [transformers.LlamaTokenizer, transformers.LlamaTokenizerFast] and len(output_ids) > 0:
             if shared.tokenizer.convert_ids_to_tokens(int(output_ids[-new_tokens])).startswith('▁'):
                 reply = ' ' + reply


@@ -11,7 +11,7 @@ import gradio as gr
 import torch
 import transformers
 from datasets import Dataset, load_dataset
-from peft import (LoraConfig, get_peft_model, prepare_model_for_int8_training,
+from peft import (LoraConfig, get_peft_model, prepare_model_for_kbit_training,
                   set_peft_model_state_dict)
 
 from modules import shared, ui, utils
@@ -30,14 +30,17 @@ try:
     MODEL_CLASSES = {v: k for k, v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES}
 except:
     standard_modules = ["q_proj", "v_proj"]
-    model_to_lora_modules = {"llama": standard_modules, "opt": standard_modules, "gptj": standard_modules, "gpt_neox": ["query_key_value"]}
+    model_to_lora_modules = {"llama": standard_modules, "opt": standard_modules, "gptj": standard_modules, "gpt_neox": ["query_key_value"], "rw":["query_key_value"]}
     MODEL_CLASSES = {
         "LlamaForCausalLM": "llama",
         "OPTForCausalLM": "opt",
         "GPTJForCausalLM": "gptj",
-        "GPTNeoXForCausalLM": "gpt_neox"
+        "GPTNeoXForCausalLM": "gpt_neox",
+        "RWForCausalLM": "rw"
     }
 
+train_log = {}
+
 WANT_INTERRUPT = False
 PARAMETERS = ["lora_name", "always_override", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "overlap_len", "newline_favor_len", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after"]
@@ -357,7 +360,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
     # == Start prepping the model itself ==
     if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'):
         logger.info("Getting model ready...")
-        prepare_model_for_int8_training(shared.model)
+        prepare_model_for_kbit_training(shared.model)
 
     logger.info("Prepping for training...")
     config = LoraConfig(
@@ -406,12 +409,19 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
                 control.should_training_stop = True
             elif state.global_step > 0 and actual_save_steps > 0 and state.global_step % actual_save_steps == 0:
                 lora_model.save_pretrained(f"{lora_file_path}/checkpoint-{tracked.current_steps}/")
+                # Save log
+                with open(f"{lora_file_path}/checkpoint-{tracked.current_steps}/training_log.json", 'w', encoding='utf-8') as file:
+                    json.dump(train_log, file, indent=2)
 
         def on_substep_end(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs):
             tracked.current_steps += 1
             if WANT_INTERRUPT:
                 control.should_epoch_stop = True
                 control.should_training_stop = True
 
+        def on_log(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, logs, **kwargs):
+            train_log.update(logs)
+
     trainer = transformers.Trainer(
         model=lora_model,
@@ -448,7 +458,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
     # == Save parameters for reuse ==
     with open(f"{lora_file_path}/training_parameters.json", 'w', encoding='utf-8') as file:
         vars = locals()
-        json.dump({x: vars[x] for x in PARAMETERS}, file)
+        json.dump({x: vars[x] for x in PARAMETERS}, file, indent=2)
 
     # == Main run and monitor loop ==
     logger.info("Starting training...")
@@ -462,7 +472,9 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
         # Note: save in the thread in case the gradio thread breaks (eg browser closed)
         lora_model.save_pretrained(lora_file_path)
         logger.info("LoRA training run is completed and saved.")
-        tracked.did_save = True
+        # Save log
+        with open(f"{lora_file_path}/training_log.json", 'w', encoding='utf-8') as file:
+            json.dump(train_log, file, indent=2)
 
     thread = threading.Thread(target=threaded_run)
     thread.start()


@@ -17,10 +17,10 @@ tqdm
 scipy
 transformers==4.30.2
 git+https://github.com/huggingface/peft@03eb378eb914fbee709ff7c86ba5b1d033b89524
-bitsandbytes==0.39.0; platform_system != "Windows"
-https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.39.0-py3-none-any.whl; platform_system == "Windows"
-llama-cpp-python==0.1.62; platform_system != "Windows"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.62/llama_cpp_python-0.1.62-cp310-cp310-win_amd64.whl; platform_system == "Windows"
+bitsandbytes==0.39.1; platform_system != "Windows"
+https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.39.1-py3-none-win_amd64.whl; platform_system == "Windows"
+llama-cpp-python==0.1.64; platform_system != "Windows"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.64/llama_cpp_python-0.1.64-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/jllllll/exllama/releases/download/0.0.1/exllama-0.0.1+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"


@@ -122,7 +122,7 @@ def count_tokens(text):
         return 'Couldn\'t count the number of tokens. Is a tokenizer loaded?'
 
-def download_model_wrapper(repo_id):
+def download_model_wrapper(repo_id, progress=gr.Progress()):
     try:
         downloader_module = importlib.import_module("download-model")
         downloader = downloader_module.ModelDownloader()
@@ -131,6 +131,7 @@ def download_model_wrapper(repo_id):
         branch = repo_id_parts[1] if len(repo_id_parts) > 1 else "main"
         check = False
 
+        progress(0.0)
         yield ("Cleaning up the model/branch names")
         model, branch = downloader.sanitize_model_and_branch_names(model, branch)
@@ -141,13 +142,16 @@ def download_model_wrapper(repo_id):
         output_folder = downloader.get_output_folder(model, branch, is_lora)
 
         if check:
+            progress(0.5)
            yield ("Checking previously downloaded files")
            downloader.check_model_files(model, branch, links, sha256, output_folder)
+            progress(1.0)
         else:
             yield (f"Downloading files to {output_folder}")
-            downloader.download_model_files(model, branch, links, sha256, output_folder, threads=1)
+            downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=1)
             yield ("Done!")
     except:
+        progress(1.0)
         yield traceback.format_exc()
@@ -193,7 +197,7 @@ def create_model_menus():
     with gr.Row():
         with gr.Column():
-            shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "AutoGPTQ", "GPTQ-for-LLaMa", "ExLlama", "llama.cpp"], value=None)
+            shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "AutoGPTQ", "GPTQ-for-LLaMa", "ExLlama", "ExLlama_HF", "llama.cpp"], value=None)
             with gr.Box():
                 with gr.Row():
                     with gr.Column():
@@ -233,6 +237,7 @@ def create_model_menus():
                         shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.')
                        shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa is currently 2x faster than AutoGPTQ on some systems. It is installed by default with the one-click installers. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
                        shared.gradio['exllama_info'] = gr.Markdown('ExLlama has to be installed manually. See the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).')
+                        shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s still a bit buggy, so feel free to help out by fixing issues.\n\nCheck out PR [#2777](https://github.com/oobabooga/text-generation-webui/pull/2777) for more details.')
 
                     with gr.Column():
                         with gr.Row():
@@ -276,7 +281,7 @@ def create_model_menus():
         save_model_settings, [shared.gradio[k] for k in ['model_menu', 'interface_state']], shared.gradio['model_status'], show_progress=False)
 
     shared.gradio['lora_menu_apply'].click(load_lora_wrapper, shared.gradio['lora_menu'], shared.gradio['model_status'], show_progress=False)
-    shared.gradio['download_model_button'].click(download_model_wrapper, shared.gradio['custom_model_menu'], shared.gradio['model_status'], show_progress=False)
+    shared.gradio['download_model_button'].click(download_model_wrapper, shared.gradio['custom_model_menu'], shared.gradio['model_status'], show_progress=True)
     shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), shared.gradio['autoload_model'], load)