Merge branch 'main' into nikita-skakun-optimize-download-model

oobabooga 2023-03-29 20:45:33 -03:00
commit 6403e72062
8 changed files with 126 additions and 53 deletions

README.md

@@ -36,10 +36,32 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 ## Installation
 
-The recommended installation methods are the following:
-* Linux and MacOS: using conda natively.
-* Windows: using conda on WSL ([WSL installation guide](https://github.com/oobabooga/text-generation-webui/wiki/Windows-Subsystem-for-Linux-(Ubuntu)-Installation-Guide)).
+### One-click installers
+
+[oobabooga-windows.zip](https://github.com/oobabooga/text-generation-webui/releases/download/installers/oobabooga-windows.zip)
+
+Just download the zip above, extract it, and double click on "install". The web UI and all its dependencies will be installed in the same folder.
+
+* To download a model, double click on "download-model"
+* To start the web UI, double click on "start-webui"
+
+Source codes: https://github.com/oobabooga/one-click-installers
+
+> **Note**
+>
+> Thanks to [@jllllll](https://github.com/jllllll) and [@ClayShoaf](https://github.com/ClayShoaf), the Windows 1-click installer now sets up 8-bit and 4-bit requirements out of the box. No additional installation steps are necessary.
+
+> **Note**
+>
+> There is no need to run the installer as admin.
+
+### Manual installation using Conda
+
+Recommended if you have some experience with the command-line.
+
+On Windows, I additionally recommend carrying out the installation on WSL instead of the base system: [WSL installation guide](https://github.com/oobabooga/text-generation-webui/wiki/Windows-Subsystem-for-Linux-(Ubuntu)-Installation-Guide).
+
+#### 0. Install Conda
+
 Conda can be downloaded here: https://docs.conda.io/en/latest/miniconda.html
@@ -84,26 +106,10 @@ pip install -r requirements.txt
 >
 > For bitsandbytes and `--load-in-8bit` to work on Linux/WSL, this dirty fix is currently necessary: https://github.com/oobabooga/text-generation-webui/issues/400#issuecomment-1474876859
 
-### Alternative: one-click installers
-
-[oobabooga-windows.zip](https://github.com/oobabooga/one-click-installers/archive/refs/heads/oobabooga-windows.zip)
-[oobabooga-linux.zip](https://github.com/oobabooga/one-click-installers/archive/refs/heads/oobabooga-linux.zip)
-
-Just download the zip above, extract it, and double click on "install". The web UI and all its dependencies will be installed in the same folder.
-
-* To download a model, double click on "download-model"
-* To start the web UI, double click on "start-webui"
-
-Source codes: https://github.com/oobabooga/one-click-installers
-
-> **Note**
->
-> To get 8-bit and 4-bit models working in your 1-click Windows installation, you can use the [one-click-bandaid](https://github.com/ClayShoaf/oobabooga-one-click-bandaid).
-
-### Alternative: native Windows installation
-
-As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings).
+### Alternative: manual Windows installation
+
+As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Windows installation guide](https://github.com/oobabooga/text-generation-webui/wiki/Windows-installation-guide).
 
 ### Alternative: Docker
@@ -177,7 +183,7 @@ Optionally, you can use the following command-line flags:
 | `--cpu` | Use the CPU to generate text.|
 | `--load-in-8bit` | Load the model with 8-bit precision.|
 | `--wbits WBITS` | GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported. |
-| `--model_type MODEL_TYPE` | GPTQ: Model type of pre-quantized model. Currently only LLaMA and OPT are supported. |
+| `--model_type MODEL_TYPE` | GPTQ: Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported. |
 | `--groupsize GROUPSIZE` | GPTQ: Group size. |
 | `--pre_layer PRE_LAYER` | GPTQ: The number of layers to preload. |
 | `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |

download-model.py

@@ -8,6 +8,7 @@ python download-model.py facebook/opt-1.3b
 import argparse
 import base64
+import datetime
 import json
 import re
 import sys
@@ -17,6 +18,14 @@ import requests
 import tqdm
 from tqdm.contrib.concurrent import thread_map
 
+parser = argparse.ArgumentParser()
+parser.add_argument('MODEL', type=str, default=None, nargs='?')
+parser.add_argument('--branch', type=str, default='main', help='Name of the Git branch to download from.')
+parser.add_argument('--threads', type=int, default=1, help='Number of files to download simultaneously.')
+parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).')
+parser.add_argument('--output', type=str, default=None, help='The folder where the model should be saved.')
+args = parser.parse_args()
+
 def get_file(url, output_folder):
     r = requests.get(url, stream=True)
     with open(output_folder / Path(url.rsplit('/', 1)[1]), 'wb') as f:
@@ -165,13 +174,24 @@ if __name__ == '__main__':
         sys.exit()
 
     links, is_lora = get_download_links_from_huggingface(model, branch)
-    base_folder = 'models' if not is_lora else 'loras'
-    if branch != 'main':
-        output_folder = Path(base_folder) / (model.split('/')[-1] + f'_{branch}')
+
+    if args.output is not None:
+        base_folder = args.output
     else:
-        output_folder = Path(base_folder) / model.split('/')[-1]
+        base_folder = 'models' if not is_lora else 'loras'
+
+    output_folder = f"{'_'.join(model.split('/')[-2:])}"
+    if branch != 'main':
+        output_folder += f'_{branch}'
+
+    # Creating the folder and writing the metadata
+    output_folder = Path(base_folder) / output_folder
     if not output_folder.exists():
         output_folder.mkdir()
+    with open(output_folder / 'huggingface-metadata.txt', 'w') as f:
+        f.write(f'url: https://huggingface.co/{model}\n')
+        f.write(f'branch: {branch}\n')
+        f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n')
 
     # Downloading the files
     print(f"Downloading the model to {output_folder}")

modules/GPTQ_loader.py

@@ -4,15 +4,50 @@ from pathlib import Path
 import accelerate
 import torch
+import transformers
+from transformers import AutoConfig, AutoModelForCausalLM
 
 import modules.shared as shared
 
 sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
-import llama
 import llama_inference_offload
-import opt
+from modelutils import find_layers
+from quant import make_quant
+
+
+def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128):
+    config = AutoConfig.from_pretrained(model)
+
+    def noop(*args, **kwargs):
+        pass
+
+    torch.nn.init.kaiming_uniform_ = noop
+    torch.nn.init.uniform_ = noop
+    torch.nn.init.normal_ = noop
+
+    torch.set_default_dtype(torch.half)
+    transformers.modeling_utils._init_weights = False
+    torch.set_default_dtype(torch.half)
+    model = AutoModelForCausalLM.from_config(config)
+    torch.set_default_dtype(torch.float)
+    model = model.eval()
+    layers = find_layers(model)
+    for name in exclude_layers:
+        if name in layers:
+            del layers[name]
+    make_quant(model, layers, wbits, groupsize, faster=faster_kernel, kernel_switch_threshold=kernel_switch_threshold)
+    del layers
+
+    print('Loading model ...')
+    if checkpoint.endswith('.safetensors'):
+        from safetensors.torch import load_file as safe_load
+        model.load_state_dict(safe_load(checkpoint))
+    else:
+        model.load_state_dict(torch.load(checkpoint))
+    model.seqlen = 2048
+    print('Done.')
+
+    return model
+
 
 def load_quantized(model_name):
     if not shared.args.model_type:
         # Try to determine model type from model name
@@ -20,6 +55,8 @@ def load_quantized(model_name):
             model_type = 'llama'
         elif model_name.lower().startswith(('opt', 'galactica')):
             model_type = 'opt'
+        elif model_name.lower().startswith(('gpt-j', 'pygmalion-6b')):
+            model_type = 'gptj'
         else:
             print("Can't determine model type from model name. Please specify it manually using --model_type "
                   "argument")
@@ -27,15 +64,12 @@ def load_quantized(model_name):
     else:
         model_type = shared.args.model_type.lower()
 
-    if model_type == 'llama':
-        if not shared.args.pre_layer:
-            load_quant = llama.load_quant
-        else:
-            load_quant = llama_inference_offload.load_quant
-    elif model_type == 'opt':
-        load_quant = opt.load_quant
+    if model_type == 'llama' and shared.args.pre_layer:
+        load_quant = llama_inference_offload.load_quant
+    elif model_type in ('llama', 'opt', 'gptj'):
+        load_quant = _load_quant
     else:
-        print("Unknown pre-quantized model type specified. Only 'llama' and 'opt' are supported")
+        print("Unknown pre-quantized model type specified. Only 'llama', 'opt' and 'gptj' are supported")
         exit()
# Now we are going to try to locate the quantized model file. # Now we are going to try to locate the quantized model file.
@@ -75,7 +109,8 @@ def load_quantized(model_name):
     if shared.args.pre_layer:
         model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, shared.args.pre_layer)
     else:
-        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize)
+        threshold = False if model_type == 'gptj' else 128
+        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, kernel_switch_threshold=threshold)
 
     # accelerate offload (doesn't work properly)
     if shared.args.gpu_memory:
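
As a summary of the dispatch above: LLaMA with `--pre_layer` keeps the CPU-offloading loader, while LLaMA, OPT, and GPT-J all go through the new generic `_load_quant`, with GPT-J disabling the kernel switch threshold. A hedged standalone sketch of that selection rule (the helper `pick_loader` is hypothetical, not repository code):

```python
def pick_loader(model_type, pre_layer=0):
    # Mirrors the branching added in load_quantized() above.
    if model_type == 'llama' and pre_layer:
        return 'llama_inference_offload.load_quant', None  # offloading path, no threshold argument
    elif model_type in ('llama', 'opt', 'gptj'):
        threshold = False if model_type == 'gptj' else 128  # GPT-J disables the kernel switch
        return '_load_quant', threshold
    else:
        raise ValueError("Only 'llama', 'opt' and 'gptj' are supported")

print(pick_loader('llama'))                # ('_load_quant', 128)
print(pick_loader('gptj'))                 # ('_load_quant', False)
print(pick_loader('llama', pre_layer=20))  # ('llama_inference_offload.load_quant', None)
```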

modules/callbacks.py

@@ -1,4 +1,5 @@
 import gc
+import traceback
 from queue import Queue
 from threading import Thread
 
@@ -63,6 +64,10 @@ class Iteratorize:
                 ret = self.mfunc(callback=_callback, **self.kwargs)
             except ValueError:
                 pass
+            except:
+                traceback.print_exc()
+                pass
+
             clear_torch_cache()
             self.q.put(self.sentinel)
             if self.c_callback:
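
The new bare `except` matters because the generation function runs inside a worker thread, where an uncaught exception would otherwise disappear without a trace and leave the consumer waiting. A minimal standalone sketch of the pattern (the names `run_in_thread` and `boom` are hypothetical, for illustration only):

```python
import traceback
from queue import Queue
from threading import Thread

def run_in_thread(func):
    # Yield results produced by func in a worker thread; never lose its traceback.
    q = Queue()
    sentinel = object()

    def task():
        try:
            q.put(func())
        except ValueError:
            pass                   # expected way to abort, as in Iteratorize
        except Exception:
            traceback.print_exc()  # otherwise the error dies silently with the thread
        q.put(sentinel)            # always unblock the consumer

    Thread(target=task).start()
    for item in iter(q.get, sentinel):
        yield item

def boom():
    raise RuntimeError("simulated failure inside the worker")

list(run_in_thread(boom))  # prints the traceback instead of failing silently
```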

modules/extensions.py

@@ -7,7 +7,7 @@ import modules.shared as shared
 
 state = {}
 available_extensions = []
-setup_called = False
+setup_called = set()
 
 def load_extensions():
     global state
@@ -53,13 +53,12 @@ def create_extensions_block():
     should_display_ui = False
 
     # Running setup function
-    if not setup_called:
-        for extension, name in iterator():
-            if hasattr(extension, "setup"):
-                extension.setup()
-            if hasattr(extension, "ui"):
-                should_display_ui = True
-        setup_called = True
+    for extension, name in iterator():
+        if hasattr(extension, "ui"):
+            should_display_ui = True
+        if extension not in setup_called and hasattr(extension, "setup"):
+            setup_called.add(extension)
+            extension.setup()
 
     # Creating the extension ui elements
     if should_display_ui:
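
Turning `setup_called` from a boolean into a set means each extension's `setup()` runs exactly once per extension rather than once globally: rebuilding the extensions block later still calls `setup()` for newly added extensions, but never twice for the same one. A small sketch of the idea (the `FakeExtension` class is a hypothetical stand-in):

```python
setup_called = set()

class FakeExtension:
    def __init__(self, name):
        self.name = name

    def setup(self):
        print(f"setup() for {self.name}")

def run_setups(extensions):
    # Mirrors the new loop: only call setup() for extensions not seen before.
    for extension in extensions:
        if extension not in setup_called and hasattr(extension, "setup"):
            setup_called.add(extension)
            extension.setup()

a, b = FakeExtension("a"), FakeExtension("b")
run_setups([a])      # setup() for a
run_setups([a, b])   # only setup() for b; a is already in setup_called
```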

modules/shared.py

@@ -84,7 +84,7 @@ parser.add_argument('--gptq-bits', type=int, default=0, help='DEPRECATED: use --
 parser.add_argument('--gptq-model-type', type=str, help='DEPRECATED: use --model_type instead.')
 parser.add_argument('--gptq-pre-layer', type=int, default=0, help='DEPRECATED: use --pre_layer instead.')
 parser.add_argument('--wbits', type=int, default=0, help='GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
-parser.add_argument('--model_type', type=str, help='GPTQ: Model type of pre-quantized model. Currently only LLaMA and OPT are supported.')
+parser.add_argument('--model_type', type=str, help='GPTQ: Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.')
 parser.add_argument('--groupsize', type=int, default=-1, help='GPTQ: Group size.')
 parser.add_argument('--pre_layer', type=int, default=0, help='GPTQ: The number of layers to preload.')

modules/training.py

@@ -2,6 +2,7 @@ import json
 import sys
 import threading
 import time
+import traceback
 from pathlib import Path
 
 import gradio as gr
@@ -119,7 +120,7 @@ def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int
     }
 
     # == Prep the dataset, format, etc ==
-    if raw_text_file is not None:
+    if raw_text_file not in ['None', '']:
         print("Loading raw text file dataset...")
         with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r') as file:
             raw_text = file.read()
@@ -136,16 +137,17 @@ def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int
         del text_chunks
     else:
-        with open(clean_path('training/formats', f'{format}.json'), 'r') as formatFile:
-            format_data: dict[str, str] = json.load(formatFile)
-
-        if dataset is None:
+        if dataset in ['None', '']:
             yield "**Missing dataset choice input, cannot continue.**"
             return
 
-        if format is None:
+        if format in ['None', '']:
             yield "**Missing format choice input, cannot continue.**"
             return
 
+        with open(clean_path('training/formats', f'{format}.json'), 'r') as formatFile:
+            format_data: dict[str, str] = json.load(formatFile)
+
         def generate_prompt(data_point: dict[str, str]):
             for options, data in format_data.items():
                 if set(options.split(',')) == set(x[0] for x in data_point.items() if len(x[1].strip()) > 0):
@@ -183,7 +185,13 @@ def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int
         bias="none",
         task_type="CAUSAL_LM"
     )
-    lora_model = get_peft_model(shared.model, config)
+
+    try:
+        lora_model = get_peft_model(shared.model, config)
+    except:
+        yield traceback.format_exc()
+        return
+
     trainer = transformers.Trainer(
         model=lora_model,
         train_dataset=train_data,
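
The training changes share one pattern: treat the UI's `'None'`/empty-string dropdown values as missing input, validate before opening any files, and surface unexpected exceptions to the user through `yield` instead of crashing the background worker. A standalone sketch of that pattern (the function `do_work` is hypothetical, not repository code):

```python
import traceback

def do_work(dataset, format, risky_step):
    # Gradio dropdowns report "nothing selected" as the string 'None' or ''.
    if dataset in ['None', '']:
        yield "**Missing dataset choice input, cannot continue.**"
        return
    if format in ['None', '']:
        yield "**Missing format choice input, cannot continue.**"
        return
    try:
        result = risky_step()
    except Exception:
        yield traceback.format_exc()  # show the error in the UI instead of dying
        return
    yield f"Done: {result}"

print(list(do_work('None', 'alpaca-format', lambda: 42)))        # missing-dataset message
print(list(do_work('my-data', 'alpaca-format', lambda: 1 / 0)))  # ZeroDivisionError traceback text
print(list(do_work('my-data', 'alpaca-format', lambda: 42)))     # ['Done: 42']
```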