From c8207d474f9c5365ab5a1c269eb71bff05a31988 Mon Sep 17 00:00:00 2001 From: Maya Eary Date: Tue, 28 Mar 2023 20:38:55 +0300 Subject: [PATCH 01/14] Generalized load_quantized --- modules/GPTQ_loader.py | 54 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index afb5695f..351d658d 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -4,13 +4,48 @@ from pathlib import Path import accelerate import torch +import transformers +from transformers import AutoConfig, AutoModelForCausalLM import modules.shared as shared sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa"))) -import llama import llama_inference_offload -import opt +from quant import make_quant +from modelutils import find_layers + +def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head']): + config = AutoConfig.from_pretrained(model) + def noop(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = AutoModelForCausalLM.from_config(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in exclude_layers: + if name in layers: + del layers[name] + make_quant(model, layers, wbits, groupsize, faster=faster_kernel) + + del layers + + print('Loading model ...') + if checkpoint.endswith('.safetensors'): + from safetensors.torch import load_file as safe_load + model.load_state_dict(safe_load(checkpoint)) + else: + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = 2048 + print('Done.') + + return model def load_quantized(model_name): @@ -20,6 +55,8 @@ def load_quantized(model_name): model_type = 'llama' elif model_name.lower().startswith(('opt', 'galactica')): model_type = 'opt' + elif model_name.lower().startswith(('gpt-j', 'pygmalion-6b')): + model_type = 'gptj' else: print("Can't determine model type from model name. Please specify it manually using --model_type " "argument") @@ -27,15 +64,12 @@ def load_quantized(model_name): else: model_type = shared.args.model_type.lower() - if model_type == 'llama': - if not shared.args.pre_layer: - load_quant = llama.load_quant - else: - load_quant = llama_inference_offload.load_quant - elif model_type == 'opt': - load_quant = opt.load_quant + if model_type == 'llama' and shared.args.pre_layer: + oad_quant = llama_inference_offload.load_quant + elif model_type in ('llama', 'opt', 'gptj'): + load_quant = _load_quant else: - print("Unknown pre-quantized model type specified. Only 'llama' and 'opt' are supported") + print("Unknown pre-quantized model type specified. Only 'llama', 'opt' and 'gptj' are supported") exit() # Now we are going to try to locate the quantized model file. 
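For context, a quick usage sketch of the generalized loader introduced above. This is not part of the patch: the model folder and checkpoint path are illustrative assumptions, and it presumes the web UI repo layout (with `repositories/GPTQ-for-LLaMa` present) that the module itself expects. The model argument only needs to be readable by `AutoConfig.from_pretrained`, and the checkpoint a GPTQ-quantized `.pt` or `.safetensors` file.

```python
# Sketch only: paths below are hypothetical examples, not taken from the patch.
from modules.GPTQ_loader import _load_quant

model = _load_quant(
    "models/pygmalion-6b",                        # folder with a config readable by AutoConfig
    "models/pygmalion-6b-4bit-128g.safetensors",  # pre-quantized GPTQ checkpoint
    wbits=4,
    groupsize=128,                                # -1 if the checkpoint was quantized without groups
)
model = model.to("cuda")  # _load_quant returns the model on the CPU; the caller moves it to the GPU
```

The point of the generalization is that one code path now serves LLaMA, OPT and GPT-J checkpoints, instead of dispatching to separate per-architecture `load_quant` implementations (`llama.load_quant`, `opt.load_quant`) as before.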
From 1c075d8d219b5fd2bfeba1b4bad8f912b22a26da Mon Sep 17 00:00:00 2001 From: Maya Eary Date: Tue, 28 Mar 2023 20:43:50 +0300 Subject: [PATCH 02/14] Fix typo --- modules/GPTQ_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index 351d658d..1fdd23c0 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -65,7 +65,7 @@ def load_quantized(model_name): model_type = shared.args.model_type.lower() if model_type == 'llama' and shared.args.pre_layer: - oad_quant = llama_inference_offload.load_quant + load_quant = llama_inference_offload.load_quant elif model_type in ('llama', 'opt', 'gptj'): load_quant = _load_quant else: From 41ec682834de3e7b79cd8e27aeec98690bc209ac Mon Sep 17 00:00:00 2001 From: Maya Eary Date: Tue, 28 Mar 2023 22:45:38 +0300 Subject: [PATCH 03/14] Disable kernel threshold for gpt-j --- modules/GPTQ_loader.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index 1fdd23c0..2a9039a3 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -14,7 +14,7 @@ import llama_inference_offload from quant import make_quant from modelutils import find_layers -def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head']): +def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128): config = AutoConfig.from_pretrained(model) def noop(*args, **kwargs): pass @@ -32,7 +32,7 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc for name in exclude_layers: if name in layers: del layers[name] - make_quant(model, layers, wbits, groupsize, faster=faster_kernel) + make_quant(model, layers, wbits, groupsize, faster=faster_kernel, kernel_switch_threshold=kernel_switch_threshold) del layers @@ -109,7 +109,8 @@ def load_quantized(model_name): if shared.args.pre_layer: model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, shared.args.pre_layer) else: - model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize) + threshold = False if model_type == 'gptj' else 128 + model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, kernel_switch_threshold=threshold) # accelerate offload (doesn't work properly) if shared.args.gpu_memory: From 0bec15ebcd1571155a54e87b371dc40534864f2e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 28 Mar 2023 17:34:15 -0300 Subject: [PATCH 04/14] Reorder imports --- modules/GPTQ_loader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index 2a9039a3..c99a63f3 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -5,14 +5,15 @@ from pathlib import Path import accelerate import torch import transformers -from transformers import AutoConfig, AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM import modules.shared as shared sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa"))) import llama_inference_offload -from quant import make_quant from modelutils import find_layers +from quant import make_quant + def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128): config = AutoConfig.from_pretrained(model) From 
010b259dde859b5703a6ea4cf2ea6c0aa4f25343 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 28 Mar 2023 17:46:00 -0300 Subject: [PATCH 05/14] Update documentation --- README.md | 2 +- modules/GPTQ_loader.py | 1 - modules/shared.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f6b1d4f5..ba386852 100644 --- a/README.md +++ b/README.md @@ -177,7 +177,7 @@ Optionally, you can use the following command-line flags: | `--cpu` | Use the CPU to generate text.| | `--load-in-8bit` | Load the model with 8-bit precision.| | `--wbits WBITS` | GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported. | -| `--model_type MODEL_TYPE` | GPTQ: Model type of pre-quantized model. Currently only LLaMA and OPT are supported. | +| `--model_type MODEL_TYPE` | GPTQ: Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported. | | `--groupsize GROUPSIZE` | GPTQ: Group size. | | `--pre_layer PRE_LAYER` | GPTQ: The number of layers to preload. | | `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. | diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index c99a63f3..7926d0ab 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -48,7 +48,6 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc return model - def load_quantized(model_name): if not shared.args.model_type: # Try to determine model type from model name diff --git a/modules/shared.py b/modules/shared.py index ac9d750c..5d1b42d4 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -84,7 +84,7 @@ parser.add_argument('--gptq-bits', type=int, default=0, help='DEPRECATED: use -- parser.add_argument('--gptq-model-type', type=str, help='DEPRECATED: use --model_type instead.') parser.add_argument('--gptq-pre-layer', type=int, default=0, help='DEPRECATED: use --pre_layer instead.') parser.add_argument('--wbits', type=int, default=0, help='GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.') -parser.add_argument('--model_type', type=str, help='GPTQ: Model type of pre-quantized model. Currently only LLaMA and OPT are supported.') +parser.add_argument('--model_type', type=str, help='GPTQ: Model type of pre-quantized model. 
Currently LLaMA, OPT, and GPT-J are supported.') parser.add_argument('--groupsize', type=int, default=-1, help='GPTQ: Group size.') parser.add_argument('--pre_layer', type=int, default=0, help='GPTQ: The number of layers to preload.') From 304f812c637f5494e6c42d296040f0506d9194a1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 28 Mar 2023 19:20:50 -0300 Subject: [PATCH 06/14] Gracefully handle CUDA out of memory errors with streaming --- modules/callbacks.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/callbacks.py b/modules/callbacks.py index d85f406d..aa92f9cb 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -1,4 +1,5 @@ import gc +import traceback from queue import Queue from threading import Thread @@ -63,6 +64,10 @@ class Iteratorize: ret = self.mfunc(callback=_callback, **self.kwargs) except ValueError: pass + except: + traceback.print_exc() + pass + clear_torch_cache() self.q.put(self.sentinel) if self.c_callback: From 1edfb9677840b03ce321a450aed87961af24a361 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 28 Mar 2023 23:27:02 -0300 Subject: [PATCH 07/14] Fix loading extensions from within the interface --- modules/extensions.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/modules/extensions.py b/modules/extensions.py index c3cf4de4..fe6a3945 100644 --- a/modules/extensions.py +++ b/modules/extensions.py @@ -7,7 +7,7 @@ import modules.shared as shared state = {} available_extensions = [] -setup_called = False +setup_called = set() def load_extensions(): global state @@ -53,13 +53,12 @@ def create_extensions_block(): should_display_ui = False # Running setup function - if not setup_called: - for extension, name in iterator(): - if hasattr(extension, "setup"): - extension.setup() - if hasattr(extension, "ui"): - should_display_ui = True - setup_called = True + for extension, name in iterator(): + if hasattr(extension, "ui"): + should_display_ui = True + if extension not in setup_called and hasattr(extension, "setup"): + setup_called.add(extension) + extension.setup() # Creating the extension ui elements if should_display_ui: From c2a863f87deee8b9a314e3c58d93b6b2703cf0d9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 29 Mar 2023 02:11:51 -0300 Subject: [PATCH 08/14] Mention the updated one-click installer --- README.md | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index ba386852..241d0e03 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,28 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Installation -The recommended installation methods are the following: +### One-click installers + +[oobabooga-windows.zip](https://github.com/oobabooga/text-generation-webui/releases/download/installers/oobabooga-windows.zip) + +Just download the zip above, extract it, and double click on "install". The web UI and all its dependencies will be installed in the same folder. + +* To download a model, double click on "download-model" +* To start the web UI, double click on "start-webui" + +Source codes: https://github.com/oobabooga/one-click-installers + +> **Note** +> +> Thanks to [@jllllll](https://github.com/jllllll) and [@ClayShoaf](https://github.com/ClayShoaf), the Windows 1-click installer now sets up 8-bit and 4-bit requirements out of the box. 
No additional installation steps are necessary. + +> **Note** +> +> There is no need to run the installer as admin. + +### Manual installation using Conda + +These are the recommended installation methods: * Linux and MacOS: using conda natively. * Windows: using conda on WSL ([WSL installation guide](https://github.com/oobabooga/text-generation-webui/wiki/Windows-Subsystem-for-Linux-(Ubuntu)-Installation-Guide)). @@ -84,24 +105,8 @@ pip install -r requirements.txt > > For bitsandbytes and `--load-in-8bit` to work on Linux/WSL, this dirty fix is currently necessary: https://github.com/oobabooga/text-generation-webui/issues/400#issuecomment-1474876859 -### Alternative: one-click installers -[oobabooga-windows.zip](https://github.com/oobabooga/one-click-installers/archive/refs/heads/oobabooga-windows.zip) - -[oobabooga-linux.zip](https://github.com/oobabooga/one-click-installers/archive/refs/heads/oobabooga-linux.zip) - -Just download the zip above, extract it, and double click on "install". The web UI and all its dependencies will be installed in the same folder. - -* To download a model, double click on "download-model" -* To start the web UI, double click on "start-webui" - -Source codes: https://github.com/oobabooga/one-click-installers - -> **Note** -> -> To get 8-bit and 4-bit models working in your 1-click Windows installation, you can use the [one-click-bandaid](https://github.com/ClayShoaf/oobabooga-one-click-bandaid). - -### Alternative: native Windows installation +### Alternative: manual Windows installation As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings). From 5d0b83c341804bcdffe73d8876468012a2edc78b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 29 Mar 2023 02:22:19 -0300 Subject: [PATCH 09/14] Update README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 241d0e03..965c9d15 100644 --- a/README.md +++ b/README.md @@ -57,10 +57,9 @@ Source codes: https://github.com/oobabooga/one-click-installers ### Manual installation using Conda -These are the recommended installation methods: +Recommended if you have some experience with the command-line. -* Linux and MacOS: using conda natively. -* Windows: using conda on WSL ([WSL installation guide](https://github.com/oobabooga/text-generation-webui/wiki/Windows-Subsystem-for-Linux-(Ubuntu)-Installation-Guide)). +On Windows, I additionally recommend carrying out the installation on WSL instead of the base system: [WSL installation guide](https://github.com/oobabooga/text-generation-webui/wiki/Windows-Subsystem-for-Linux-(Ubuntu)-Installation-Guide). Conda can be downloaded here: https://docs.conda.io/en/latest/miniconda.html From 3b4447a4fe2ef7c99322a626b750ea1aa43083e8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 29 Mar 2023 02:24:11 -0300 Subject: [PATCH 10/14] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 965c9d15..87367877 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,8 @@ Recommended if you have some experience with the command-line. 
On Windows, I additionally recommend carrying out the installation on WSL instead of the base system: [WSL installation guide](https://github.com/oobabooga/text-generation-webui/wiki/Windows-Subsystem-for-Linux-(Ubuntu)-Installation-Guide). +#### 0. Install Conda + Conda can be downloaded here: https://docs.conda.io/en/latest/miniconda.html On Linux or WSL, it can be automatically installed with these two commands: From 41b58bc47e84458b880386e57d0d17e2bfe6f76c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 29 Mar 2023 11:02:29 -0300 Subject: [PATCH 11/14] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 87367877..97f26ccb 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ pip install -r requirements.txt ### Alternative: manual Windows installation -As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings). +As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Windows installation guide](https://github.com/oobabooga/text-generation-webui/wiki/Windows-installation-guide). ### Alternative: Docker From a6d03730639463eb261b40ec5dad380f5df791ed Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 29 Mar 2023 11:48:17 -0300 Subject: [PATCH 12/14] Fix training dataset loading #636 --- modules/training.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/modules/training.py b/modules/training.py index 7bcecb38..913866d9 100644 --- a/modules/training.py +++ b/modules/training.py @@ -119,7 +119,7 @@ def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int } # == Prep the dataset, format, etc == - if raw_text_file is not None: + if raw_text_file not in ['None', '']: print("Loading raw text file dataset...") with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r') as file: raw_text = file.read() @@ -136,16 +136,17 @@ def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int del text_chunks else: - with open(clean_path('training/formats', f'{format}.json'), 'r') as formatFile: - format_data: dict[str, str] = json.load(formatFile) - - if dataset is None: + if dataset in ['None', '']: yield "**Missing dataset choice input, cannot continue.**" return - if format is None: + + if format in ['None', '']: yield "**Missing format choice input, cannot continue.**" return + with open(clean_path('training/formats', f'{format}.json'), 'r') as formatFile: + format_data: dict[str, str] = json.load(formatFile) + def generate_prompt(data_point: dict[str, str]): for options, data in format_data.items(): if set(options.split(',')) == set(x[0] for x in data_point.items() if len(x[1].strip()) > 0): From 58349f44a0924671e65de7cb42764fb846653afe Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 29 Mar 2023 11:55:34 -0300 Subject: [PATCH 13/14] Handle training exception for unsupported models --- modules/training.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/training.py b/modules/training.py index 
913866d9..62ba181c 100644 --- a/modules/training.py +++ b/modules/training.py @@ -2,6 +2,7 @@ import json import sys import threading import time +import traceback from pathlib import Path import gradio as gr @@ -184,7 +185,13 @@ def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int bias="none", task_type="CAUSAL_LM" ) - lora_model = get_peft_model(shared.model, config) + + try: + lora_model = get_peft_model(shared.model, config) + except: + yield traceback.format_exc() + return + trainer = transformers.Trainer( model=lora_model, train_dataset=train_data, From 1445ea86f7c2a0c8e3f88337ab15d4e076accc70 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 29 Mar 2023 20:26:44 -0300 Subject: [PATCH 14/14] Add --output and better metadata for downloading models --- download-model.py | 21 +++++++++++++++++---- loras/place-your-loras-here.txt | 0 2 files changed, 17 insertions(+), 4 deletions(-) delete mode 100644 loras/place-your-loras-here.txt diff --git a/download-model.py b/download-model.py index dce7e749..05d9dca4 100644 --- a/download-model.py +++ b/download-model.py @@ -8,6 +8,7 @@ python download-model.py facebook/opt-1.3b import argparse import base64 +import datetime import json import multiprocessing import re @@ -22,6 +23,7 @@ parser.add_argument('MODEL', type=str, default=None, nargs='?') parser.add_argument('--branch', type=str, default='main', help='Name of the Git branch to download from.') parser.add_argument('--threads', type=int, default=1, help='Number of files to download simultaneously.') parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).') +parser.add_argument('--output', type=str, default=None, help='The folder where the model should be saved.') args = parser.parse_args() def get_file(args): @@ -169,13 +171,24 @@ if __name__ == '__main__': sys.exit() links, is_lora = get_download_links_from_huggingface(model, branch) - base_folder = 'models' if not is_lora else 'loras' - if branch != 'main': - output_folder = Path(base_folder) / (model.split('/')[-1] + f'_{branch}') + + if args.output is not None: + base_folder = args.output else: - output_folder = Path(base_folder) / model.split('/')[-1] + base_folder = 'models' if not is_lora else 'loras' + + output_folder = f"{'_'.join(model.split('/')[-2:])}" + if branch != 'main': + output_folder += f'_{branch}' + + # Creating the folder and writing the metadata + output_folder = Path(base_folder) / output_folder if not output_folder.exists(): output_folder.mkdir() + with open(output_folder / 'huggingface-metadata.txt', 'w') as f: + f.write(f'url: https://huggingface.co/{model}\n') + f.write(f'branch: {branch}\n') + f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n') # Downloading the files print(f"Downloading the model to {output_folder}") diff --git a/loras/place-your-loras-here.txt b/loras/place-your-loras-here.txt deleted file mode 100644 index e69de29b..00000000
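To make the behaviour of the last patch concrete, here is a small sketch of the folder naming and metadata file it produces, using the `facebook/opt-1.3b` example from the script's own docstring. `--output`, when given, simply replaces the `models`/`loras` base folder; everything else mirrors the diff above.

```python
# Sketch of the output-folder naming and metadata added in patch 14/14.
import datetime
from pathlib import Path

model, branch, base_folder = 'facebook/opt-1.3b', 'main', 'models'

# New naming: organization and model name joined with '_', branch appended when it is not 'main'
output_folder = '_'.join(model.split('/')[-2:])   # -> 'facebook_opt-1.3b'
if branch != 'main':
    output_folder += f'_{branch}'

output_folder = Path(base_folder) / output_folder
if not output_folder.exists():
    output_folder.mkdir()

# New metadata file recording where and when the files came from
with open(output_folder / 'huggingface-metadata.txt', 'w') as f:
    f.write(f'url: https://huggingface.co/{model}\n')
    f.write(f'branch: {branch}\n')
    f.write(f'download date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n')
```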