diff --git a/README.md b/README.md
index f6b1d4f5..97f26ccb 100644
--- a/README.md
+++ b/README.md
@@ -36,10 +36,32 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 ## Installation
 
-The recommended installation methods are the following:
+### One-click installers
 
-* Linux and MacOS: using conda natively.
-* Windows: using conda on WSL ([WSL installation guide](https://github.com/oobabooga/text-generation-webui/wiki/Windows-Subsystem-for-Linux-(Ubuntu)-Installation-Guide)).
+[oobabooga-windows.zip](https://github.com/oobabooga/text-generation-webui/releases/download/installers/oobabooga-windows.zip)
+
+Just download the zip above, extract it, and double click on "install". The web UI and all its dependencies will be installed in the same folder.
+
+* To download a model, double click on "download-model"
+* To start the web UI, double click on "start-webui"
+
+Source codes: https://github.com/oobabooga/one-click-installers
+
+> **Note**
+>
+> Thanks to [@jllllll](https://github.com/jllllll) and [@ClayShoaf](https://github.com/ClayShoaf), the Windows 1-click installer now sets up 8-bit and 4-bit requirements out of the box. No additional installation steps are necessary.
+
+> **Note**
+>
+> There is no need to run the installer as admin.
+
+### Manual installation using Conda
+
+Recommended if you have some experience with the command-line.
+
+On Windows, I additionally recommend carrying out the installation on WSL instead of the base system: [WSL installation guide](https://github.com/oobabooga/text-generation-webui/wiki/Windows-Subsystem-for-Linux-(Ubuntu)-Installation-Guide).
+
+#### 0. Install Conda
 
 Conda can be downloaded here: https://docs.conda.io/en/latest/miniconda.html
@@ -84,26 +106,10 @@ pip install -r requirements.txt
 >
 > For bitsandbytes and `--load-in-8bit` to work on Linux/WSL, this dirty fix is currently necessary: https://github.com/oobabooga/text-generation-webui/issues/400#issuecomment-1474876859
 
-### Alternative: one-click installers
 
-[oobabooga-windows.zip](https://github.com/oobabooga/one-click-installers/archive/refs/heads/oobabooga-windows.zip)
+### Alternative: manual Windows installation
 
-[oobabooga-linux.zip](https://github.com/oobabooga/one-click-installers/archive/refs/heads/oobabooga-linux.zip)
-
-Just download the zip above, extract it, and double click on "install". The web UI and all its dependencies will be installed in the same folder.
-
-* To download a model, double click on "download-model"
-* To start the web UI, double click on "start-webui"
-
-Source codes: https://github.com/oobabooga/one-click-installers
-
-> **Note**
->
-> To get 8-bit and 4-bit models working in your 1-click Windows installation, you can use the [one-click-bandaid](https://github.com/ClayShoaf/oobabooga-one-click-bandaid).
-
-### Alternative: native Windows installation
-
-As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings).
+As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Windows installation guide](https://github.com/oobabooga/text-generation-webui/wiki/Windows-installation-guide).
 
 ### Alternative: Docker
 
@@ -177,7 +183,7 @@ Optionally, you can use the following command-line flags:
 | `--cpu` | Use the CPU to generate text.|
 | `--load-in-8bit` | Load the model with 8-bit precision.|
 | `--wbits WBITS` | GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported. |
-| `--model_type MODEL_TYPE` | GPTQ: Model type of pre-quantized model. Currently only LLaMA and OPT are supported. |
+| `--model_type MODEL_TYPE` | GPTQ: Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported. |
 | `--groupsize GROUPSIZE` | GPTQ: Group size. |
 | `--pre_layer PRE_LAYER` | GPTQ: The number of layers to preload. |
 | `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
diff --git a/download-model.py b/download-model.py
index a2d3a6d6..f67055ba 100644
--- a/download-model.py
+++ b/download-model.py
@@ -8,6 +8,7 @@ python download-model.py facebook/opt-1.3b
 
 import argparse
 import base64
+import datetime
 import json
 import re
 import sys
@@ -17,6 +18,14 @@ import requests
 import tqdm
 from tqdm.contrib.concurrent import thread_map
 
+parser = argparse.ArgumentParser()
+parser.add_argument('MODEL', type=str, default=None, nargs='?')
+parser.add_argument('--branch', type=str, default='main', help='Name of the Git branch to download from.')
+parser.add_argument('--threads', type=int, default=1, help='Number of files to download simultaneously.')
+parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).')
+parser.add_argument('--output', type=str, default=None, help='The folder where the model should be saved.')
+args = parser.parse_args()
+
 def get_file(url, output_folder):
     r = requests.get(url, stream=True)
     with open(output_folder / Path(url.rsplit('/', 1)[1]), 'wb') as f:
@@ -165,13 +174,24 @@ if __name__ == '__main__':
         sys.exit()
 
     links, is_lora = get_download_links_from_huggingface(model, branch)
-    base_folder = 'models' if not is_lora else 'loras'
-    if branch != 'main':
-        output_folder = Path(base_folder) / (model.split('/')[-1] + f'_{branch}')
+
+    if args.output is not None:
+        base_folder = args.output
     else:
-        output_folder = Path(base_folder) / model.split('/')[-1]
+        base_folder = 'models' if not is_lora else 'loras'
+
+    output_folder = f"{'_'.join(model.split('/')[-2:])}"
+    if branch != 'main':
+        output_folder += f'_{branch}'
+
+    # Creating the folder and writing the metadata
+    output_folder = Path(base_folder) / output_folder
     if not output_folder.exists():
         output_folder.mkdir()
+    with open(output_folder / 'huggingface-metadata.txt', 'w') as f:
+        f.write(f'url: https://huggingface.co/{model}\n')
+        f.write(f'branch: {branch}\n')
+        f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n')
 
     # Downloading the files
     print(f"Downloading the model to {output_folder}")
diff --git a/loras/place-your-loras-here.txt b/loras/place-your-loras-here.txt
deleted file mode 100644
index e69de29b..00000000
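The download script's folder naming changes from `model-name` to `user_model-name`, and the new `--output` flag only overrides the base folder, not that name. Below is a minimal standalone sketch of the naming logic; the model id and branch are example values, not part of the patch:

```python
# Sketch of the new output-folder naming used by download-model.py.
# "facebook/opt-1.3b" and "main" are examples only.
from pathlib import Path

model = 'facebook/opt-1.3b'
branch = 'main'
base_folder = 'models'  # replaced by args.output when --output is passed

output_folder = f"{'_'.join(model.split('/')[-2:])}"
if branch != 'main':
    output_folder += f'_{branch}'

print(Path(base_folder) / output_folder)  # models/facebook_opt-1.3b
```

The same folder also receives a `huggingface-metadata.txt` file recording the source URL, branch, and download date.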
diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py
index afb5695f..7926d0ab 100644
--- a/modules/GPTQ_loader.py
+++ b/modules/GPTQ_loader.py
@@ -4,15 +4,50 @@ from pathlib import Path
 
 import accelerate
 import torch
+import transformers
+from transformers import AutoConfig, AutoModelForCausalLM
 
 import modules.shared as shared
 
 sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
-import llama
 import llama_inference_offload
-import opt
+from modelutils import find_layers
+from quant import make_quant
 
 
+def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128):
+    config = AutoConfig.from_pretrained(model)
+    def noop(*args, **kwargs):
+        pass
+    torch.nn.init.kaiming_uniform_ = noop
+    torch.nn.init.uniform_ = noop
+    torch.nn.init.normal_ = noop
+
+    torch.set_default_dtype(torch.half)
+    transformers.modeling_utils._init_weights = False
+    torch.set_default_dtype(torch.half)
+    model = AutoModelForCausalLM.from_config(config)
+    torch.set_default_dtype(torch.float)
+    model = model.eval()
+    layers = find_layers(model)
+    for name in exclude_layers:
+        if name in layers:
+            del layers[name]
+    make_quant(model, layers, wbits, groupsize, faster=faster_kernel, kernel_switch_threshold=kernel_switch_threshold)
+
+    del layers
+
+    print('Loading model ...')
+    if checkpoint.endswith('.safetensors'):
+        from safetensors.torch import load_file as safe_load
+        model.load_state_dict(safe_load(checkpoint))
+    else:
+        model.load_state_dict(torch.load(checkpoint))
+    model.seqlen = 2048
+    print('Done.')
+
+    return model
+
 def load_quantized(model_name):
     if not shared.args.model_type:
         # Try to determine model type from model name
@@ -20,6 +55,8 @@ def load_quantized(model_name):
             model_type = 'llama'
         elif model_name.lower().startswith(('opt', 'galactica')):
             model_type = 'opt'
+        elif model_name.lower().startswith(('gpt-j', 'pygmalion-6b')):
+            model_type = 'gptj'
         else:
             print("Can't determine model type from model name. Please specify it manually using --model_type "
                   "argument")
@@ -27,15 +64,12 @@
     else:
         model_type = shared.args.model_type.lower()
 
-    if model_type == 'llama':
-        if not shared.args.pre_layer:
-            load_quant = llama.load_quant
-        else:
-            load_quant = llama_inference_offload.load_quant
-    elif model_type == 'opt':
-        load_quant = opt.load_quant
+    if model_type == 'llama' and shared.args.pre_layer:
+        load_quant = llama_inference_offload.load_quant
+    elif model_type in ('llama', 'opt', 'gptj'):
+        load_quant = _load_quant
     else:
-        print("Unknown pre-quantized model type specified. Only 'llama' and 'opt' are supported")
+        print("Unknown pre-quantized model type specified. Only 'llama', 'opt' and 'gptj' are supported")
         exit()
 
     # Now we are going to try to locate the quantized model file.
@@ -75,7 +109,8 @@ def load_quantized(model_name):
     if shared.args.pre_layer:
         model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, shared.args.pre_layer)
     else:
-        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize)
+        threshold = False if model_type == 'gptj' else 128
+        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, kernel_switch_threshold=threshold)
 
     # accelerate offload (doesn't work properly)
     if shared.args.gpu_memory:
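The per-architecture loaders from GPTQ-for-LLaMa (`llama.load_quant`, `opt.load_quant`) are replaced by the generic `_load_quant` above, which builds an empty model from its Hugging Face config with weight initialization disabled, swaps the target linear layers for quantized ones via `make_quant`, and only then loads the checkpoint. A hypothetical direct call for a 4-bit GPT-J checkpoint is sketched below; the folder and file names are invented for the example, while the signature and the `kernel_switch_threshold=False` choice for GPT-J come from the patch:

```python
# Hypothetical call to the new generic loader. In normal use this path is
# reached through load_quantized() when the web UI is started with
# --wbits 4 --model_type gptj rather than by calling the function directly.
from modules.GPTQ_loader import _load_quant

model = _load_quant(
    'models/gpt-j-6B-gptq',                            # folder containing config.json (example path)
    'models/gpt-j-6B-gptq/gpt-j-6b-4bit.safetensors',  # pre-quantized checkpoint (example path)
    wbits=4,
    groupsize=-1,
    kernel_switch_threshold=False,                     # the value load_quantized() passes for 'gptj'
)
```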
diff --git a/modules/callbacks.py b/modules/callbacks.py
index d85f406d..aa92f9cb 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -1,4 +1,5 @@
 import gc
+import traceback
 from queue import Queue
 from threading import Thread
 
@@ -63,6 +64,10 @@ class Iteratorize:
                 ret = self.mfunc(callback=_callback, **self.kwargs)
             except ValueError:
                 pass
+            except:
+                traceback.print_exc()
+                pass
+
             clear_torch_cache()
             self.q.put(self.sentinel)
             if self.c_callback:
diff --git a/modules/extensions.py b/modules/extensions.py
index c3cf4de4..fe6a3945 100644
--- a/modules/extensions.py
+++ b/modules/extensions.py
@@ -7,7 +7,7 @@ import modules.shared as shared
 
 state = {}
 available_extensions = []
-setup_called = False
+setup_called = set()
 
 def load_extensions():
     global state
@@ -53,13 +53,12 @@ def create_extensions_block():
     should_display_ui = False
 
     # Running setup function
-    if not setup_called:
-        for extension, name in iterator():
-            if hasattr(extension, "setup"):
-                extension.setup()
-            if hasattr(extension, "ui"):
-                should_display_ui = True
-        setup_called = True
+    for extension, name in iterator():
+        if hasattr(extension, "ui"):
+            should_display_ui = True
+        if extension not in setup_called and hasattr(extension, "setup"):
+            setup_called.add(extension)
+            extension.setup()
 
     # Creating the extension ui elements
     if should_display_ui:
diff --git a/modules/shared.py b/modules/shared.py
index ac9d750c..5d1b42d4 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -84,7 +84,7 @@ parser.add_argument('--gptq-bits', type=int, default=0, help='DEPRECATED: use --
 parser.add_argument('--gptq-model-type', type=str, help='DEPRECATED: use --model_type instead.')
 parser.add_argument('--gptq-pre-layer', type=int, default=0, help='DEPRECATED: use --pre_layer instead.')
 parser.add_argument('--wbits', type=int, default=0, help='GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
-parser.add_argument('--model_type', type=str, help='GPTQ: Model type of pre-quantized model. Currently only LLaMA and OPT are supported.')
+parser.add_argument('--model_type', type=str, help='GPTQ: Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.')
 parser.add_argument('--groupsize', type=int, default=-1, help='GPTQ: Group size.')
 parser.add_argument('--pre_layer', type=int, default=0, help='GPTQ: The number of layers to preload.')
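In `extensions.py`, turning `setup_called` from a boolean into a set means `setup()` now runs once per extension rather than once globally, so an extension activated after the first UI build still gets initialized. A minimal sketch of that pattern, outside the web UI and with stand-in extension modules invented for illustration:

```python
# Run-setup-once-per-extension pattern: a set of already-initialized
# extension modules replaces the old True/False flag.
import types

setup_called = set()

def build_ui(active_extensions):
    for extension in active_extensions:
        if extension not in setup_called and hasattr(extension, 'setup'):
            setup_called.add(extension)
            extension.setup()

ext_a = types.ModuleType('ext_a')
ext_a.setup = lambda: print('ext_a setup')
ext_b = types.ModuleType('ext_b')
ext_b.setup = lambda: print('ext_b setup')

build_ui([ext_a])         # runs ext_a.setup() only
build_ui([ext_a, ext_b])  # ext_a is skipped, ext_b.setup() runs
```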
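`do_train` is a generator that streams status strings back to the UI, so wrapping `get_peft_model` in a try/except and yielding `traceback.format_exc()` surfaces LoRA setup failures in the interface instead of crashing the request. A minimal sketch of that reporting pattern; the failing step is simulated and nothing below is the web UI's actual training code:

```python
# Generator-based error reporting, as used by do_train(): exceptions become a
# yielded traceback string that the caller (the UI) can display.
import traceback

def do_train_sketch(fail: bool):
    try:
        if fail:
            raise RuntimeError('simulated get_peft_model() failure')
        lora_model = object()  # stands in for get_peft_model(shared.model, config)
    except Exception:
        yield traceback.format_exc()
        return
    yield 'Training started...'

for status in do_train_sketch(fail=True):
    print(status)  # prints the traceback text instead of raising
```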