From ff0d0ac552ef69c3bae6bf34397706033d6e2fff Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 20 Apr 2023 13:26:58 -0300 Subject: [PATCH 01/33] Api extension bug fix --- extensions/api/script.py | 1 + 1 file changed, 1 insertion(+) diff --git a/extensions/api/script.py b/extensions/api/script.py index ab004542..5e429130 100644 --- a/extensions/api/script.py +++ b/extensions/api/script.py @@ -59,6 +59,7 @@ class Handler(BaseHTTPRequestHandler): 'truncation_length': int(body.get('truncation_length', 2048)), 'ban_eos_token': bool(body.get('ban_eos_token', False)), 'skip_special_tokens': bool(body.get('skip_special_tokens', True)), + 'custom_stopping_strings': '', # leave this blank 'stopping_strings': body.get('stopping_strings', []), } stopping_strings = generate_params.pop('stopping_strings') From c4f4f413897ff41266e66c35f253405ecafccbfb Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 21 Apr 2023 00:20:33 -0300 Subject: [PATCH 02/33] Add an "Evaluate" tab to calculate the perplexities of models (#1322) --- modules/evaluate.py | 140 ++++++++++++++++++++++++++++++++++++++++++++ modules/models.py | 14 ++--- modules/training.py | 65 ++++++++++++++++---- modules/ui.py | 3 +- requirements.txt | 3 +- 5 files changed, 203 insertions(+), 22 deletions(-) create mode 100644 modules/evaluate.py diff --git a/modules/evaluate.py b/modules/evaluate.py new file mode 100644 index 00000000..9822ddea --- /dev/null +++ b/modules/evaluate.py @@ -0,0 +1,140 @@ +import datetime +import traceback +from pathlib import Path + +import pandas as pd +import torch +from datasets import load_dataset +from tqdm import tqdm + +from modules import shared +from modules.models import load_model, unload_model +from modules.text_generation import encode +from server import get_model_specific_settings, update_model_parameters + + +def load_past_evaluations(): + if Path('logs/evaluations.csv').exists(): + 
df = pd.read_csv(Path('logs/evaluations.csv'), dtype=str) + df['Perplexity'] = pd.to_numeric(df['Perplexity']) + return df + else: + return pd.DataFrame(columns=['Model', 'LoRAs', 'Dataset', 'Perplexity', 'stride', 'max_length', 'Date', 'Comment']) +past_evaluations = load_past_evaluations() + + +def save_past_evaluations(df): + df.to_csv(Path('logs/evaluations.csv'), index=False) + + +def calculate_perplexity(models, input_dataset, stride, _max_length): + ''' + Based on: + https://huggingface.co/docs/transformers/perplexity#calculating-ppl-with-fixedlength-models + ''' + + global past_evaluations + cumulative_log = '' + cumulative_log += "Loading the input dataset...\n" + yield cumulative_log + + # Copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/triton/utils/datautils.py + if input_dataset == 'wikitext': + data = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') + text = "\n\n".join(data['text']) + elif input_dataset == 'ptb': + data = load_dataset('ptb_text_only', 'penn_treebank', split='validation') + text = "\n\n".join(data['sentence']) + elif input_dataset == 'ptb_new': + data = load_dataset('ptb_text_only', 'penn_treebank', split='test') + text = " ".join(data['sentence']) + else: + with open(Path(f'training/datasets/{input_dataset}.txt'), 'r', encoding='utf-8') as f: + text = f.read() + + for model in models: + if is_in_past_evaluations(model, input_dataset, stride, _max_length): + cumulative_log += f"{model} has already been tested. 
Ignoring.\n" + yield cumulative_log + continue + + if model != 'current model': + try: + yield cumulative_log + f"Loading {model}...\n" + model_settings = get_model_specific_settings(model) + shared.settings.update(model_settings) # hijacking the interface defaults + update_model_parameters(model_settings) # hijacking the command-line arguments + shared.model_name = model + unload_model() + shared.model, shared.tokenizer = load_model(shared.model_name) + except: + cumulative_log += f"Failed to load {model}. Moving on.\n" + yield cumulative_log + continue + + cumulative_log += f"Processing {model}...\n" + yield cumulative_log + "Tokenizing the input dataset...\n" + encodings = encode(text, add_special_tokens=False) + seq_len = encodings.shape[1] + max_length = _max_length or shared.model.config.max_position_embeddings + nlls = [] + prev_end_loc = 0 + for begin_loc in tqdm(range(0, seq_len, stride)): + yield cumulative_log + f"Evaluating... {100*begin_loc/seq_len:.2f}%" + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings[:, begin_loc:end_loc] + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = shared.model(input_ids, labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. + neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + ppl = torch.exp(torch.stack(nlls).mean()) + add_entry_to_past_evaluations(float(ppl), shared.model_name, input_dataset, stride, _max_length) + save_past_evaluations(past_evaluations) + cumulative_log += f"Done. 
The perplexity is: {float(ppl)}\n\n" + yield cumulative_log + + +def add_entry_to_past_evaluations(perplexity, model, dataset, stride, max_length): + global past_evaluations + entry = { + 'Model': model, + 'LoRAs': ', '.join(shared.lora_names) or '-', + 'Dataset': dataset, + 'Perplexity': perplexity, + 'stride': str(stride), + 'max_length': str(max_length), + 'Date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + 'Comment': '' + } + past_evaluations = pd.concat([past_evaluations, pd.DataFrame([entry])], ignore_index=True) + + +def is_in_past_evaluations(model, dataset, stride, max_length): + entries = past_evaluations[(past_evaluations['Model'] == model) & + (past_evaluations['Dataset'] == dataset) & + (past_evaluations['max_length'] == str(max_length)) & + (past_evaluations['stride'] == str(stride))] + + if entries.shape[0] > 0: + return True + else: + return False + + +def generate_markdown_table(): + sorted_df = past_evaluations.sort_values(by=['Dataset', 'stride', 'Perplexity', 'Date']) + return sorted_df diff --git a/modules/models.py b/modules/models.py index d639ca65..800d0be2 100644 --- a/modules/models.py +++ b/modules/models.py @@ -53,7 +53,7 @@ def load_model(model_name): # Load the model in simple 16-bit mode by default if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV, shared.is_llamacpp]): - model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{shared.model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16, trust_remote_code=trust_remote_code) + model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16, trust_remote_code=trust_remote_code) if torch.has_mps: device = 
torch.device('mps') model = model.to(device) @@ -81,11 +81,11 @@ def load_model(model_name): num_bits=4, group_size=64, group_dim=2, symmetric=False)) - model = OptLM(f"facebook/{shared.model_name}", env, shared.args.model_dir, policy) + model = OptLM(f"facebook/{model_name}", env, shared.args.model_dir, policy) # DeepSpeed ZeRO-3 elif shared.args.deepspeed: - model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{shared.model_name}"), torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16) + model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}"), torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16) model = deepspeed.initialize(model=model, config_params=ds_config, model_parameters=None, optimizer=None, lr_scheduler=None)[0] model.module.eval() # Inference print(f"DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}") @@ -169,7 +169,7 @@ def load_model(model_name): if shared.args.disk: params["offload_folder"] = shared.args.disk_cache_dir - checkpoint = Path(f'{shared.args.model_dir}/{shared.model_name}') + checkpoint = Path(f'{shared.args.model_dir}/{model_name}') if shared.args.load_in_8bit and params.get('max_memory', None) is not None and params['device_map'] == 'auto': config = AutoConfig.from_pretrained(checkpoint) @@ -190,7 +190,7 @@ def load_model(model_name): llama_attn_hijack.hijack_llama_attention() # Loading the tokenizer - if any((k in shared.model_name.lower() for k in ['gpt4chan', 'gpt-4chan'])) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists(): + if any((k in model_name.lower() for k in ['gpt4chan', 'gpt-4chan'])) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists(): tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/")) elif type(model) is transformers.LlamaForCausalLM: tokenizer = None @@ -205,7 +205,7 @@ def load_model(model_name): # Otherwise, load it from the model folder and hope that these # are not outdated tokenizer files. 
if tokenizer is None: - tokenizer = LlamaTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/{shared.model_name}/"), clean_up_tokenization_spaces=True) + tokenizer = LlamaTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}/"), clean_up_tokenization_spaces=True) try: tokenizer.eos_token_id = 2 tokenizer.bos_token_id = 1 @@ -213,7 +213,7 @@ def load_model(model_name): except: pass else: - tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/{shared.model_name}/"), trust_remote_code=trust_remote_code) + tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}/"), trust_remote_code=trust_remote_code) print(f"Loaded the model in {(time.time()-t0):.2f} seconds.") return model, tokenizer diff --git a/modules/training.py b/modules/training.py index 1a12a0e4..000a1cea 100644 --- a/modules/training.py +++ b/modules/training.py @@ -10,9 +10,12 @@ import gradio as gr import torch import transformers from datasets import Dataset, load_dataset -from peft import LoraConfig, get_peft_model, set_peft_model_state_dict, prepare_model_for_int8_training +from peft import (LoraConfig, get_peft_model, prepare_model_for_int8_training, + set_peft_model_state_dict) from modules import shared, ui +from modules.evaluate import calculate_perplexity, generate_markdown_table, save_past_evaluations +from server import get_available_loras, get_available_models # This mapping is from a very recent commit, not yet released. # If not available, default to a backup map for the 3 safe model types. 
@@ -40,10 +43,6 @@ def get_datasets(path: str, ext: str): return ['None'] + sorted(set([k.stem for k in Path(path).glob(f'*.{ext}') if k.stem != 'put-trainer-datasets-here']), key=str.lower) -def get_available_loras(): - return ['None'] + sorted([item.name for item in list(Path(shared.args.lora_dir).glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json'))], key=str.lower) - - def create_train_interface(): with gr.Tab('Train LoRA', elem_id='lora-train-tab'): with gr.Row(): @@ -82,9 +81,9 @@ def create_train_interface(): eval_steps = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation dataset is given, test it every time this many steps pass.') - with gr.Tab(label='Raw Text File'): + with gr.Tab(label="Raw text file"): with gr.Row(): - raw_text_file = gr.Dropdown(choices=get_datasets('training/datasets', 'txt'), value='None', label='Text File', info='The raw text file to use for training.') + raw_text_file = gr.Dropdown(choices=get_datasets('training/datasets', 'txt'), value='None', label='Text file', info='The raw text file to use for training.') ui.create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': get_datasets('training/datasets', 'txt')}, 'refresh-button') with gr.Row(): @@ -106,11 +105,48 @@ def create_train_interface(): output = gr.Markdown(value="Ready") - all_params = [lora_name, always_override, save_steps, micro_batch_size, batch_size, epochs, learning_rate, lr_scheduler_type, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format, eval_steps, raw_text_file, overlap_len, newline_favor_len, do_shuffle, higher_rank_limit, warmup_steps, optimizer] - copy_from.change(do_copy_params, [copy_from] + all_params, all_params) - start_button.click(do_train, all_params, output) - stop_button.click(do_interrupt, None, None, queue=False) - higher_rank_limit.change(change_rank_limit, [higher_rank_limit], [lora_rank, lora_alpha]) + with gr.Tab('Perplexity evaluation', elem_id='evaluate-tab'): 
+ with gr.Row(): + with gr.Column(): + models = gr.Dropdown(get_available_models(), label='Models', multiselect=True) + evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + get_datasets('training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The raw text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under training/datasets.') + with gr.Row(): + stride_length = gr.Slider(label='Stride', minimum=1, maximum=2048, value=512, step=1, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.') + max_length = gr.Slider(label='max_length', minimum=1, maximum=8096, value=0, step=1, info='The context for each evaluation. If set to 0, the maximum context length for the model will be used.') + + with gr.Row(): + start_current_evaluation = gr.Button("Evaluate loaded model") + start_evaluation = gr.Button("Evaluate selected models") + stop_evaluation = gr.Button("Interrupt") + + with gr.Column(): + evaluation_log = gr.Markdown(value = '') + + evaluation_table = gr.Dataframe(value=generate_markdown_table(), interactive=True) + save_comments = gr.Button('Save comments') + + # Training events + all_params = [lora_name, always_override, save_steps, micro_batch_size, batch_size, epochs, learning_rate, lr_scheduler_type, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format, eval_steps, raw_text_file, overlap_len, newline_favor_len, do_shuffle, higher_rank_limit, warmup_steps, optimizer] + copy_from.change(do_copy_params, [copy_from] + all_params, all_params) + start_button.click(do_train, all_params, output) + stop_button.click(do_interrupt, None, None, queue=False) + higher_rank_limit.change(change_rank_limit, [higher_rank_limit], [lora_rank, lora_alpha]) + + # Evaluation events. 
For some reason, the interrupt event + # doesn't work with the .then() syntax, so I write them one + # by one in this ugly but functional way. + ev = start_evaluation.click(calculate_perplexity, [models, evaluate_text_file, stride_length, max_length], evaluation_log, show_progress=False) + start_evaluation.click(generate_markdown_table, None, evaluation_table, show_progress=False) + + tmp = gr.State('') + start_current_evaluation.click(lambda: ['current model'], None, tmp) + ev_cur = start_current_evaluation.click(calculate_perplexity, [tmp, evaluate_text_file, stride_length, max_length], evaluation_log, show_progress=False) + start_current_evaluation.click(generate_markdown_table, None, evaluation_table, show_progress=False) + + stop_evaluation.click(None, None, None, cancels=[ev, ev_cur], queue=False) + save_comments.click( + save_past_evaluations, evaluation_table, None).then( + lambda: "Comments saved.", None, evaluation_log, show_progress=False) def do_interrupt(): @@ -133,6 +169,7 @@ def do_copy_params(lora_name: str, *args): result.append(params[key]) else: result.append(args[i]) + return result @@ -155,7 +192,8 @@ def clean_path(base_path: str, path: str): def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, do_shuffle: bool, higher_rank_limit: bool, warmup_steps: int, optimizer: str): if shared.args.monkey_patch: - from monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_gptq_lora_model + from monkeypatch.peft_tuners_lora_monkey_patch import \ + replace_peft_model_with_gptq_lora_model replace_peft_model_with_gptq_lora_model() global WANT_INTERRUPT @@ -300,6 +338,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: 
int, micro_batch if '4bit' in str(type(m)): if m.is_v1_model: m.zeros = m.zeros.half() + m.scales = m.scales.half() class Tracked(): diff --git a/modules/ui.py b/modules/ui.py index 121b6c5a..d84cbacc 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -20,7 +20,8 @@ theme = gr.themes.Default( font_mono=['IBM Plex Mono', 'ui-monospace', 'Consolas', 'monospace'], ).set( border_color_primary='#c5c5d2', - button_large_padding='6px 12px' + button_large_padding='6px 12px', + body_text_color_subdued='#484848' ) def list_model_elements(): diff --git a/requirements.txt b/requirements.txt index 6c7e22ec..e5f0a8f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,12 +5,13 @@ flexgen==0.1.7 gradio==3.25.0 markdown numpy +pandas Pillow>=9.5.0 +pyyaml requests rwkv==0.7.3 safetensors==0.3.0 sentencepiece -pyyaml tqdm git+https://github.com/huggingface/peft transformers==4.28.1 From 2d766d2e1945896f27486b1e1d764cbaa31f4af3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 21 Apr 2023 02:35:28 -0300 Subject: [PATCH 03/33] Improve notebook mode button sizes --- css/main.css | 4 ++++ server.py | 13 ++++--------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/css/main.css b/css/main.css index 547868ed..82ca96fa 100644 --- a/css/main.css +++ b/css/main.css @@ -107,3 +107,7 @@ footer { button { font-size: 14px !important; } + +.small-button { + max-width: 171px; +} diff --git a/server.py b/server.py index bc4f73f4..5cbce5e0 100644 --- a/server.py +++ b/server.py @@ -615,15 +615,10 @@ def create_interface(): shared.gradio['html'] = gr.HTML() with gr.Row(): - with gr.Column(): - with gr.Row(): - shared.gradio['Generate'] = gr.Button('Generate', variant='primary') - shared.gradio['Stop'] = gr.Button('Stop') - shared.gradio['Undo'] = gr.Button('Undo') - shared.gradio['Regenerate'] = gr.Button('Regenerate') - - with gr.Column(): - pass + shared.gradio['Generate'] = gr.Button('Generate', variant='primary', 
elem_classes="small-button") + shared.gradio['Stop'] = gr.Button('Stop', elem_classes="small-button") + shared.gradio['Undo'] = gr.Button('Undo', elem_classes="small-button") + shared.gradio['Regenerate'] = gr.Button('Regenerate', elem_classes="small-button") with gr.Column(scale=1): gr.HTML('
') From 5e023ae64d1cdbabffd9c2c50359fc82c0665ab5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 21 Apr 2023 02:47:18 -0300 Subject: [PATCH 04/33] Change dropdown menu highlight color --- modules/ui.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/ui.py b/modules/ui.py index d84cbacc..5db36b3e 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -21,7 +21,8 @@ theme = gr.themes.Default( ).set( border_color_primary='#c5c5d2', button_large_padding='6px 12px', - body_text_color_subdued='#484848' + body_text_color_subdued='#484848', + background_fill_secondary='#eaeaea' ) def list_model_elements(): From d46b9b7c50d2b47fc12302420c9048d5ffd1c3d0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 21 Apr 2023 12:34:08 -0300 Subject: [PATCH 05/33] Fix evaluate comment saving --- modules/evaluate.py | 2 ++ modules/shared.py | 1 + modules/training.py | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/evaluate.py b/modules/evaluate.py index 9822ddea..3134280c 100644 --- a/modules/evaluate.py +++ b/modules/evaluate.py @@ -24,6 +24,8 @@ past_evaluations = load_past_evaluations() def save_past_evaluations(df): + global past_evaluations + past_evaluations = df df.to_csv(Path('logs/evaluations.csv'), index=False) diff --git a/modules/shared.py b/modules/shared.py index a08f134f..d6cdc0bc 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -123,6 +123,7 @@ parser.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized m parser.add_argument('--model_type', type=str, help='Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.') parser.add_argument('--groupsize', type=int, default=-1, help='Group size.') parser.add_argument('--pre_layer', type=int, default=0, help='The number of layers to allocate to the GPU. 
Setting this parameter enables CPU offloading for 4-bit models.') +parser.add_argument('--file-path', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.') parser.add_argument('--monkey-patch', action='store_true', help='Apply the monkey patch for using LoRAs with quantized models.') parser.add_argument('--no-quant_attn', action='store_true', help='(triton) Disable quant attention. If you encounter incoherent results try disabling this.') parser.add_argument('--no-warmup_autotune', action='store_true', help='(triton) Disable warmup autotune.') diff --git a/modules/training.py b/modules/training.py index 000a1cea..70629ef3 100644 --- a/modules/training.py +++ b/modules/training.py @@ -112,7 +112,7 @@ def create_train_interface(): evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + get_datasets('training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The raw text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under training/datasets.') with gr.Row(): stride_length = gr.Slider(label='Stride', minimum=1, maximum=2048, value=512, step=1, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.') - max_length = gr.Slider(label='max_length', minimum=1, maximum=8096, value=0, step=1, info='The context for each evaluation. If set to 0, the maximum context length for the model will be used.') + max_length = gr.Slider(label='max_length', minimum=0, maximum=8096, value=0, step=1, info='The context for each evaluation. 
If set to 0, the maximum context length for the model will be used.') with gr.Row(): start_current_evaluation = gr.Button("Evaluate loaded model") From eddd0164495c728066ff645b67f670cc39fc4965 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 21 Apr 2023 12:41:27 -0300 Subject: [PATCH 06/33] Minor deletion --- modules/shared.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/shared.py b/modules/shared.py index d6cdc0bc..a08f134f 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -123,7 +123,6 @@ parser.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized m parser.add_argument('--model_type', type=str, help='Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.') parser.add_argument('--groupsize', type=int, default=-1, help='Group size.') parser.add_argument('--pre_layer', type=int, default=0, help='The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models.') -parser.add_argument('--file-path', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.') parser.add_argument('--monkey-patch', action='store_true', help='Apply the monkey patch for using LoRAs with quantized models.') parser.add_argument('--no-quant_attn', action='store_true', help='(triton) Disable quant attention. If you encounter incoherent results try disabling this.') parser.add_argument('--no-warmup_autotune', action='store_true', help='(triton) Disable warmup autotune.') From e1aa9d5173368015fd7e21d582d4fe620adee901 Mon Sep 17 00:00:00 2001 From: USBhost Date: Fri, 21 Apr 2023 10:43:56 -0500 Subject: [PATCH 07/33] Support upstream GPTQ once again. 
(#1451) --- modules/GPTQ_loader.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index fc70e5e3..3379d27a 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -12,7 +12,11 @@ import modules.shared as shared sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa"))) import llama_inference_offload -from modelutils import find_layers + +try: + from modelutils import find_layers +except ImportError: + from utils import find_layers try: from quant import make_quant From a6ef2429fa5a23de0bb1a28e50361f282daca9a2 Mon Sep 17 00:00:00 2001 From: Lou Bernardi Date: Fri, 21 Apr 2023 11:54:50 -0400 Subject: [PATCH 08/33] Add "do not download" and "download from HF" to download-model.py (#1439) --- download-model.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/download-model.py b/download-model.py index c35cf831..a095f398 100644 --- a/download-model.py +++ b/download-model.py @@ -41,13 +41,17 @@ def select_model_from_default_options(): char = chr(ord('A') + i) choices[char] = name print(f"{char}) {name}") - char = chr(ord('A') + len(models)) - print(f"{char}) None of the above") + char_hugging = chr(ord('A') + len(models)) + print(f"{char_hugging}) Manually specify a Hugging Face model") + char_exit = chr(ord('A') + len(models) + 1) + print(f"{char_exit}) Do not download a model") print() print("Input> ", end='') choice = input()[0].strip().upper() - if choice == char: + if choice == char_exit: + exit() + elif choice == char_hugging: print("""\nThen type the name of your desired Hugging Face model in the format organization/name. 
Examples: From c238ba9532d0b504005f2557827291a993aad95a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 21 Apr 2023 17:18:34 -0300 Subject: [PATCH 09/33] Add a 'Count tokens' button --- server.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/server.py b/server.py index 5cbce5e0..c79d63ec 100644 --- a/server.py +++ b/server.py @@ -1,7 +1,8 @@ import os -import requests import warnings +import requests + os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False' os.environ['BITSANDBYTES_NOWELCOME'] = '1' warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated') @@ -15,10 +16,12 @@ def my_get(url, **kwargs): original_get = requests.get requests.get = my_get import gradio as gr + requests.get = original_get # This fixes LaTeX rendering on some systems import matplotlib + matplotlib.use('Agg') import importlib @@ -44,7 +47,8 @@ from modules import api, chat, shared, training, ui from modules.html_generator import chat_html_wrapper from modules.LoRA import add_lora_to_model from modules.models import load_model, load_soft_prompt, unload_model -from modules.text_generation import generate_reply, stop_everything_event +from modules.text_generation import (encode, generate_reply, + stop_everything_event) def get_available_models(): @@ -172,6 +176,11 @@ def load_prompt(fname): return text +def count_tokens(text): + tokens = len(encode(text)[0]) + return f'{tokens} tokens in the input.' 
+ + def download_model_wrapper(repo_id): try: downloader = importlib.import_module("download-model") @@ -628,6 +637,7 @@ def create_interface(): ui.create_refresh_button(shared.gradio['prompt_menu'], lambda: None, lambda: {'choices': get_available_prompts()}, 'refresh-button') shared.gradio['save_prompt'] = gr.Button('Save prompt') + shared.gradio['count_tokens'] = gr.Button('Count tokens') shared.gradio['status'] = gr.Markdown('') with gr.Tab("Parameters", elem_id="parameters"): @@ -644,10 +654,11 @@ def create_interface(): shared.gradio['textbox'] = gr.Textbox(value=default_text, elem_classes="textbox_default", lines=27, label='Input') shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) with gr.Row(): - shared.gradio['Generate'] = gr.Button('Generate', variant='primary') - shared.gradio['Stop'] = gr.Button('Stop') - shared.gradio['Continue'] = gr.Button('Continue') - shared.gradio['save_prompt'] = gr.Button('Save prompt') + shared.gradio['Generate'] = gr.Button('Generate', variant='primary', elem_classes="small-button") + shared.gradio['Stop'] = gr.Button('Stop', elem_classes="small-button") + shared.gradio['Continue'] = gr.Button('Continue', elem_classes="small-button") + shared.gradio['save_prompt'] = gr.Button('Save prompt', elem_classes="small-button") + shared.gradio['count_tokens'] = gr.Button('Count tokens', elem_classes="small-button") with gr.Row(): with gr.Column(): @@ -838,8 +849,9 @@ def create_interface(): ) shared.gradio['Stop'].click(stop_everything_event, None, None, queue=False, cancels=gen_events if shared.args.no_stream else None) - shared.gradio['prompt_menu'].change(load_prompt, [shared.gradio['prompt_menu']], [shared.gradio['textbox']], show_progress=False) - shared.gradio['save_prompt'].click(save_prompt, [shared.gradio['textbox']], [shared.gradio['status']], show_progress=False) + 
shared.gradio['prompt_menu'].change(load_prompt, shared.gradio['prompt_menu'], shared.gradio['textbox'], show_progress=False) + shared.gradio['save_prompt'].click(save_prompt, shared.gradio['textbox'], shared.gradio['status'], show_progress=False) + shared.gradio['count_tokens'].click(count_tokens, shared.gradio['textbox'], shared.gradio['status'], show_progress=False) shared.gradio['interface'].load(None, None, None, _js=f"() => {{{ui.main_js}}}") # Launch the interface From 2dca8bb25e2049d9a563833914ba45e07ac74d64 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 21 Apr 2023 17:20:59 -0300 Subject: [PATCH 10/33] Sort imports --- server.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/server.py b/server.py index c79d63ec..2de817cb 100644 --- a/server.py +++ b/server.py @@ -1,7 +1,6 @@ import os -import warnings - import requests +import warnings os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False' os.environ['BITSANDBYTES_NOWELCOME'] = '1' @@ -16,12 +15,10 @@ def my_get(url, **kwargs): original_get = requests.get requests.get = my_get import gradio as gr - requests.get = original_get # This fixes LaTeX rendering on some systems import matplotlib - matplotlib.use('Agg') import importlib From 143e88694dd655365f13285205d29da98858195c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=CE=A6=CF=86?= <42910943+Brawlence@users.noreply.github.com> Date: Fri, 21 Apr 2023 23:49:18 +0300 Subject: [PATCH 11/33] SD_api_pictures: Modefix, +hires options, UI layout change (#1400) --- css/main.css | 26 +++++++++++++++++ extensions/sd_api_pictures/README.MD | 2 +- extensions/sd_api_pictures/script.py | 42 +++++++++++++++++++++------- 3 files changed, 59 insertions(+), 11 deletions(-) diff --git a/css/main.css b/css/main.css index 82ca96fa..cdde2705 100644 --- a/css/main.css +++ b/css/main.css @@ -111,3 +111,29 @@ button { .small-button { max-width: 171px; } + +/* Align the elements for SD_api_picture extension */ +.SDAP 
#sampler_box { + padding-top: var(--spacing-sm); + padding-bottom: var(--spacing-sm); +} + +.SDAP #seed_box, +.SDAP #cfg_box { + padding-top: var(--spacing-md); +} + +.SDAP #sampler_box span, +.SDAP #seed_box span, +.SDAP #cfg_box span{ + margin-bottom: var(--spacing-sm); +} + +.SDAP svg.dropdown-arrow { + flex-shrink: 0 !important; + margin: 0px !important; +} + +.SDAP .hires_opts input[type="number"] { + width: 6em !important; +} \ No newline at end of file diff --git a/extensions/sd_api_pictures/README.MD b/extensions/sd_api_pictures/README.MD index cf2713de..67c75e14 100644 --- a/extensions/sd_api_pictures/README.MD +++ b/extensions/sd_api_pictures/README.MD @@ -1,7 +1,7 @@ ## Description: TL;DR: Lets the bot answer you with a picture! -Stable Diffusion API pictures for TextGen, v.1.1.1 +Stable Diffusion API pictures for TextGen, v.1.2.0 An extension to [oobabooga's textgen-webui](https://github.com/oobabooga/text-generation-webui) allowing you to receive pics generated by [Automatic1111's SD-WebUI API](https://github.com/AUTOMATIC1111/stable-diffusion-webui)
diff --git a/extensions/sd_api_pictures/script.py b/extensions/sd_api_pictures/script.py index 9bba3d7d..1189a593 100644 --- a/extensions/sd_api_pictures/script.py +++ b/extensions/sd_api_pictures/script.py @@ -25,7 +25,11 @@ params = { 'negative_prompt': '(worst quality, low quality:1.3)', 'width': 512, 'height': 512, + 'denoising_strength': 0.61, 'restore_faces': False, + 'enable_hr': False, + 'hr_upscaler': 'ESRGAN_4x', + 'hr_scale': '1.0', 'seed': -1, 'sampler_name': 'DDIM', 'steps': 32, @@ -74,7 +78,6 @@ SD_models = ['NeverEndingDream'] # TODO: get with http://{address}}/sdapi/v1/sd streaming_state = shared.args.no_stream # remember if chat streaming was enabled picture_response = False # specifies if the next model response should appear as a picture - def remove_surrounded_chars(string): # this expression matches to 'as few symbols as possible (0 upwards) between any asterisks' OR # 'as few symbols as possible (0 upwards) between an asterisk and the end of the string' @@ -123,11 +126,16 @@ def get_SD_pictures(description): "prompt": params['prompt_prefix'] + description, "seed": params['seed'], "sampler_name": params['sampler_name'], + "enable_hr": params['enable_hr'], + "hr_scale": params['hr_scale'], + "hr_upscaler": params['hr_upscaler'], + "denoising_strength": params['denoising_strength'], "steps": params['steps'], "cfg_scale": params['cfg_scale'], "width": params['width'], "height": params['height'], "restore_faces": params['restore_faces'], + "override_settings_restore_afterwards": True, "negative_prompt": params['negative_prompt'] } @@ -246,15 +254,15 @@ def SD_api_address_update(address): return gr.Textbox.update(label=msg) - def ui(): # Gradio elements # gr.Markdown('### Stable Diffusion API Pictures') # Currently the name of extension is shown as the title - with gr.Accordion("Parameters", open=True): + with gr.Accordion("Parameters", open=True, elem_classes="SDAP"): with gr.Row(): address = gr.Textbox(placeholder=params['address'], 
value=params['address'], label='Auto1111\'s WebUI address') - mode = gr.Dropdown(["Manual", "Immersive/Interactive", "Picturebook/Adventure"], value="Manual", label="Mode of operation", type="index") + modes_list = ["Manual", "Immersive/Interactive", "Picturebook/Adventure"] + mode = gr.Dropdown(modes_list, value=modes_list[params['mode']], label="Mode of operation", type="index") with gr.Column(scale=1, min_width=300): manage_VRAM = gr.Checkbox(value=params['manage_VRAM'], label='Manage VRAM') save_img = gr.Checkbox(value=params['save_img'], label='Keep original images and use them in chat') @@ -264,17 +272,25 @@ def ui(): with gr.Accordion("Generation parameters", open=False): prompt_prefix = gr.Textbox(placeholder=params['prompt_prefix'], value=params['prompt_prefix'], label='Prompt Prefix (best used to describe the look of the character)') + negative_prompt = gr.Textbox(placeholder=params['negative_prompt'], value=params['negative_prompt'], label='Negative Prompt') with gr.Row(): - with gr.Column(): - negative_prompt = gr.Textbox(placeholder=params['negative_prompt'], value=params['negative_prompt'], label='Negative Prompt') - sampler_name = gr.Textbox(placeholder=params['sampler_name'], value=params['sampler_name'], label='Sampler') with gr.Column(): width = gr.Slider(256, 768, value=params['width'], step=64, label='Width') height = gr.Slider(256, 768, value=params['height'], step=64, label='Height') + with gr.Column(): + sampler_name = gr.Textbox(placeholder=params['sampler_name'], value=params['sampler_name'], label='Sampling method', elem_id="sampler_box") + steps = gr.Slider(1, 150, value=params['steps'], step=1, label="Sampling steps") with gr.Row(): - steps = gr.Number(label="Steps:", value=params['steps']) - seed = gr.Number(label="Seed:", value=params['seed']) - cfg_scale = gr.Number(label="CFG Scale:", value=params['cfg_scale']) + seed = gr.Number(label="Seed", value=params['seed'], elem_id="seed_box") + cfg_scale = gr.Number(label="CFG Scale", 
value=params['cfg_scale'], elem_id="cfg_box") + with gr.Column() as hr_options: + restore_faces = gr.Checkbox(value=params['restore_faces'], label='Restore faces') + enable_hr = gr.Checkbox(value=params['enable_hr'], label='Hires. fix') + with gr.Row(visible=params['enable_hr'], elem_classes="hires_opts") as hr_options: + hr_scale = gr.Slider(1, 4, value=params['hr_scale'], step=0.1, label='Upscale by') + denoising_strength = gr.Slider(0, 1, value=params['denoising_strength'], step=0.01, label='Denoising strength') + hr_upscaler = gr.Textbox(placeholder=params['hr_upscaler'], value=params['hr_upscaler'], label='Upscaler') + # Event functions to update the parameters in the backend address.change(lambda x: params.update({"address": filter_address(x)}), address, None) @@ -289,6 +305,12 @@ def ui(): negative_prompt.change(lambda x: params.update({"negative_prompt": x}), negative_prompt, None) width.change(lambda x: params.update({"width": x}), width, None) height.change(lambda x: params.update({"height": x}), height, None) + hr_scale.change(lambda x: params.update({"hr_scale": x}), hr_scale, None) + denoising_strength.change(lambda x: params.update({"denoising_strength": x}), denoising_strength, None) + restore_faces.change(lambda x: params.update({"restore_faces": x}), restore_faces, None) + hr_upscaler.change(lambda x: params.update({"hr_upscaler": x}), hr_upscaler, None) + enable_hr.change(lambda x: params.update({"enable_hr": x}), enable_hr, None) + enable_hr.change(lambda x: hr_options.update(visible=params["enable_hr"]), enable_hr, hr_options) sampler_name.change(lambda x: params.update({"sampler_name": x}), sampler_name, None) steps.change(lambda x: params.update({"steps": x}), steps, None) From 505c2c73e86268faaa3dc1eff982703c234061c8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 00:11:27 -0300 Subject: [PATCH 12/33] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/README.md b/README.md index a03598a4..eaadebda 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. * [RWKV model](https://github.com/oobabooga/text-generation-webui/wiki/RWKV-model) * [LoRA (loading and training)](https://github.com/oobabooga/text-generation-webui/wiki/Using-LoRAs) * Softprompts -* [Extensions](https://github.com/oobabooga/text-generation-webui/wiki/Extensions) +* [Extensions](https://github.com/oobabooga/text-generation-webui/wiki/Extensions) - see the [user extensions list](https://github.com/oobabooga/text-generation-webui-extensions) ## Installation From 25b433990a960516adfb70549dcda9992e409ec0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 02:33:32 -0300 Subject: [PATCH 13/33] Create README.md --- docs/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/README.md diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..e7123041 --- /dev/null +++ b/docs/README.md @@ -0,0 +1 @@ +wip From 80ef7c7bcb987d4bf4b9336006fa759db63e1b9c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 02:34:13 -0300 Subject: [PATCH 14/33] Add files via upload --- docs/Custom-chat-characters.md | 30 ++++++ docs/DeepSpeed.md | 23 +++++ docs/Extensions.md | 157 +++++++++++++++++++++++++++++ docs/FlexGen.md | 64 ++++++++++++ docs/GPTQ-models-(4-bit-mode).md | 128 +++++++++++++++++++++++ docs/Home.md | 1 + docs/LLaMA-model.md | 45 +++++++++ docs/Low-VRAM-guide.md | 51 ++++++++++ docs/RWKV-model.md | 54 ++++++++++ docs/Spell-book.md | 111 ++++++++++++++++++++ docs/System-requirements.md | 42 ++++++++ docs/Using-LoRAs.md | 88 ++++++++++++++++ docs/WSL-installation-guide.md | 73 ++++++++++++++ docs/Windows-installation-guide.md | 9 ++ docs/llama.cpp-models.md | 35 +++++++ 15 files changed, 911 
insertions(+) create mode 100644 docs/Custom-chat-characters.md create mode 100644 docs/DeepSpeed.md create mode 100644 docs/Extensions.md create mode 100644 docs/FlexGen.md create mode 100644 docs/GPTQ-models-(4-bit-mode).md create mode 100644 docs/Home.md create mode 100644 docs/LLaMA-model.md create mode 100644 docs/Low-VRAM-guide.md create mode 100644 docs/RWKV-model.md create mode 100644 docs/Spell-book.md create mode 100644 docs/System-requirements.md create mode 100644 docs/Using-LoRAs.md create mode 100644 docs/WSL-installation-guide.md create mode 100644 docs/Windows-installation-guide.md create mode 100644 docs/llama.cpp-models.md diff --git a/docs/Custom-chat-characters.md b/docs/Custom-chat-characters.md new file mode 100644 index 00000000..b25a2294 --- /dev/null +++ b/docs/Custom-chat-characters.md @@ -0,0 +1,30 @@ +Custom chat mode characters are defined by `.yaml` files inside the `characters` folder. An example is included: [Example.yaml](https://github.com/oobabooga/text-generation-webui/blob/main/characters/Example.yaml) + +The following fields may be defined: + +| Field | Description | +|-------|-------------| +| `name` | The character's name. | +| `context` | A string that appears at the top of the prompt. It usually contains a description of the character's personality. | +| `greeting` (optional) | The character's opening message when a new conversation is started. | +| `example_dialogue` (optional) | A few example messages to guide the model. | +| `your_name` (optional) | Your name. This overwrites what you had previously written in the `Your name` field in the interface. | + +#### Special tokens + +* `{{char}}` or `<BOT>`: are replaced with the character's name +* `{{user}}` or `<USER>`: are replaced with your name + +These replacements happen when the character is loaded, and they apply to the `context`, `greeting`, and `example_dialogue` fields. + +#### How do I add a profile picture for my character?
+ +Put an image with the same name as your character's yaml file into the `characters` folder. For example, if your bot is `Character.yaml`, add `Character.jpg` or `Character.png` to the folder. + +#### Is the chat history truncated in the prompt? + +Once your prompt reaches the 2048 token limit, old messages will be removed one at a time. The context string will always stay at the top of the prompt and will never get truncated. + +#### Pygmalion format characters + +These are also supported out of the box. Simply put the JSON file in the `characters` folder, or upload it directly from the web UI by clicking on the "Upload character" tab at the bottom. \ No newline at end of file diff --git a/docs/DeepSpeed.md b/docs/DeepSpeed.md new file mode 100644 index 00000000..70cd8151 --- /dev/null +++ b/docs/DeepSpeed.md @@ -0,0 +1,23 @@ +An alternative way of reducing the GPU memory usage of models is to use the `DeepSpeed ZeRO-3` optimization. + +With this, I have been able to load a 6b model (GPT-J 6B) with less than 6GB of VRAM. The speed of text generation is very decent and much better than what would be accomplished with `--auto-devices --gpu-memory 6`. + +As far as I know, DeepSpeed is only available for Linux at the moment. + +### How to use it + +1. Install DeepSpeed: + +``` +pip install deepspeed +``` + +2. Start the web UI replacing `python` with `deepspeed --num_gpus=1` and adding the `--deepspeed` flag. Example: + +``` +deepspeed --num_gpus=1 server.py --deepspeed --chat --model gpt-j-6B +``` + +### Learn more + +For more information, check out [this comment](https://github.com/oobabooga/text-generation-webui/issues/40#issuecomment-1412038622) by 81300, who came up with the DeepSpeed support in this web UI. \ No newline at end of file diff --git a/docs/Extensions.md b/docs/Extensions.md new file mode 100644 index 00000000..184ace55 --- /dev/null +++ b/docs/Extensions.md @@ -0,0 +1,157 @@ +This web UI supports extensions. 
They are simply files under + +``` +extensions/your_extension_name/script.py +``` + +which can be invoked with the + +``` +--extension your_extension_name +``` + +command-line flag. + +## [text-generation-webui-extensions](https://github.com/oobabooga/text-generation-webui-extensions) + +The link above contains a directory of user extensions for text-generation-webui. + +## Built-in extensions + +|Extension|Description| +|---------|-----------| +|[google_translate](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/google_translate)| Automatically translates inputs and outputs using Google Translate.| +|[character_bias](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/character_bias)| Just a very simple example that biases the bot's responses in chat mode.| +|[gallery](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/gallery/)| Creates a gallery with the chat characters and their pictures. | +|[silero_tts](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/silero_tts)| Text-to-speech extension using [Silero](https://github.com/snakers4/silero-models). When used in chat mode, it replaces the responses with an audio widget. | +|[elevenlabs_tts](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/elevenlabs_tts)| Text-to-speech extension using the [ElevenLabs](https://beta.elevenlabs.io/) API. You need an API key to use it. Author: [@MetaIX](https://github.com/MetaIX). | +|[send_pictures](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/send_pictures/)| Creates an image upload field that can be used to send images to the bot in chat mode. Captions are automatically generated using BLIP. Author: [@SillyLossy](https://github.com/sillylossy).| +|[api](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/api)| Creates an API similar to the one provided by KoboldAI. 
Works with TavernAI: start the web UI with `python server.py --no-stream --extensions api` and set the API URL to `http://127.0.0.1:5000/api`. Author: [@mayaeary](https://github.com/mayaeary).| +|[whisper_stt](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/whisper_stt)| Allows you to enter your inputs in chat mode using your microphone. Author: [@EliasVincent](https://github.com/EliasVincent).| +|[sd_api_pictures](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/sd_api_pictures)| Allows you to request pictures from the bot in chat mode, which will be generated using the AUTOMATIC1111 Stable Diffusion API. See examples [here](https://github.com/oobabooga/text-generation-webui/pull/309). Author: [@Brawlence](https://github.com/Brawlence).| + +## How to write an extension + +`script.py` has access to all variables in the UI through the `modules.shared` module, and it may define the following functions: + +| Function | Description | +|-------------|-------------| +| `def ui()` | Creates custom gradio elements when the UI is launched. | +| `def input_modifier(string)` | Modifies the input string before it enters the model. In chat mode, it is applied to the user message. Otherwise, it is applied to the entire prompt. | +| `def output_modifier(string)` | Modifies the output string before it is presented in the UI. In chat mode, it is applied to the bot's reply. Otherwise, it is applied to the entire output. | +| `def bot_prefix_modifier(string)` | Applied in chat mode to the prefix for the bot's reply (more on that below). | +| `def custom_generate_chat_prompt(...)` | Overrides the prompt generator in chat mode. 
| + +Additionally, the script may define two special global variables: + +#### `params` dictionary + +```python +params = { + "language string": "ja", +} +``` + +This dictionary can be used to make the extension parameters customizable by adding entries to a `settings.json` file like this: + +```python +"google_translate-language string": "fr", +``` + +#### `input_hijack` dictionary + +```python +input_hijack = { + 'state': False, + 'value': ["", ""] +} +``` +This is only relevant in chat mode. If your extension sets `input_hijack['state']` to `True` at any moment, the next call to `modules.chat.chatbot_wrapper` will use the values inside `input_hijack['value']` as the user input for text generation. See the `send_pictures` extension above for an example. + +## The `bot_prefix_modifier` + +In chat mode, this function modifies the prefix for a new bot message. For instance, if your bot is named `Marie Antoinette`, the default prefix for a new message will be + +``` +Marie Antoinette: +``` + +Using `bot_prefix_modifier`, you can change it to: + +``` +Marie Antoinette: *I am very enthusiastic* +``` + +Marie Antoinette will become very enthusiastic in all her messages. + +## Using multiple extensions at the same time + +In order to use your extension, you must start the web UI with the `--extensions` flag followed by the name of your extension (the folder under `text-generation-webui/extensions` where `script.py` resides). + +You can activate more than one extension at a time by providing their names separated by spaces. The input, output and bot prefix modifiers will be applied in the specified order. For `custom_generate_chat_prompt`, only the first declaration encountered will be used and the rest will be ignored.
+ +``` +python server.py --extensions enthusiasm translate # First apply enthusiasm, then translate +python server.py --extensions translate enthusiasm # First apply translate, then enthusiasm +``` + +## `custom_generate_chat_prompt` example + +Below is an extension that just reproduces the default prompt generator in `modules/chat.py`. You can modify it freely to come up with your own prompts in chat mode. + +```python +def custom_generate_chat_prompt(user_input, state, **kwargs): + impersonate = kwargs['impersonate'] if 'impersonate' in kwargs else False + _continue = kwargs['_continue'] if '_continue' in kwargs else False + also_return_rows = kwargs['also_return_rows'] if 'also_return_rows' in kwargs else False + is_instruct = state['mode'] == 'instruct' + rows = [f"{state['context'].strip()}\n"] + + # Finding the maximum prompt size + chat_prompt_size = state['chat_prompt_size'] + if shared.soft_prompt: + chat_prompt_size -= shared.soft_prompt_tensor.shape[1] + max_length = min(get_max_prompt_length(state), chat_prompt_size) + + if is_instruct: + prefix1 = f"{state['name1']}\n" + prefix2 = f"{state['name2']}\n" + else: + prefix1 = f"{state['name1']}: " + prefix2 = f"{state['name2']}: " + + i = len(shared.history['internal']) - 1 + while i >= 0 and len(encode(''.join(rows))[0]) < max_length: + if _continue and i == len(shared.history['internal']) - 1: + rows.insert(1, f"{prefix2}{shared.history['internal'][i][1]}") + else: + rows.insert(1, f"{prefix2}{shared.history['internal'][i][1].strip()}{state['end_of_turn']}\n") + string = shared.history['internal'][i][0] + if string not in ['', '<|BEGIN-VISIBLE-CHAT|>']: + rows.insert(1, f"{prefix1}{string.strip()}{state['end_of_turn']}\n") + i -= 1 + + if impersonate: + rows.append(f"{prefix1.strip() if not is_instruct else prefix1}") + limit = 2 + elif _continue: + limit = 3 + else: + # Adding the user message + user_input = fix_newlines(user_input) + if len(user_input) > 0: + 
rows.append(f"{prefix1}{user_input}{state['end_of_turn']}\n") + + # Adding the Character prefix + rows.append(apply_extensions(f"{prefix2.strip() if not is_instruct else prefix2}", "bot_prefix")) + limit = 3 + + while len(rows) > limit and len(encode(''.join(rows))[0]) >= max_length: + rows.pop(1) + prompt = ''.join(rows) + + if also_return_rows: + return prompt, rows + else: + return prompt +``` diff --git a/docs/FlexGen.md b/docs/FlexGen.md new file mode 100644 index 00000000..dce71f9e --- /dev/null +++ b/docs/FlexGen.md @@ -0,0 +1,64 @@ +>FlexGen is a high-throughput generation engine for running large language models with limited GPU memory (e.g., a 16GB T4 GPU or a 24GB RTX3090 gaming card!). + +https://github.com/FMInference/FlexGen + +## Installation + +No additional installation steps are necessary. FlexGen is in the `requirements.txt` file for this project. + +## Converting a model + +FlexGen only works with the OPT model, and it needs to be converted to numpy format before starting the web UI: + +``` +python convert-to-flexgen.py models/opt-1.3b/ +``` + +The output will be saved to `models/opt-1.3b-np/`. + +## Usage + +The basic command is the following: + +``` +python server.py --model opt-1.3b --flexgen +``` + +For large models, the RAM usage may be too high and your computer may freeze. If that happens, you can try this: + +``` +python server.py --model opt-1.3b --flexgen --compress-weight +``` + +With this second command, I was able to run both OPT-6.7b and OPT-13B with **2GB VRAM**, and the speed was good in both cases. 
+ +You can also manually set the offload strategy with + +``` +python server.py --model opt-1.3b --flexgen --percent 0 100 100 0 100 0 +``` + +where the six numbers after `--percent` are: + +``` +the percentage of weight on GPU +the percentage of weight on CPU +the percentage of attention cache on GPU +the percentage of attention cache on CPU +the percentage of activations on GPU +the percentage of activations on CPU +``` + +You should typically only change the first two numbers. If their sum is less than 100, the remaining layers will be offloaded to the disk, by default into the `text-generation-webui/cache` folder. + +## Performance + +In my experiments with OPT-30B using a RTX 3090 on Linux, I have obtained these results: + +* `--flexgen --compress-weight --percent 0 100 100 0 100 0`: 0.99 seconds per token. +* `--flexgen --compress-weight --percent 100 0 100 0 100 0`: 0.765 seconds per token. + +## Limitations + +* Only works with the OPT models. +* Only two generation parameters are available: `temperature` and `do_sample`. \ No newline at end of file diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md new file mode 100644 index 00000000..9ed7cc37 --- /dev/null +++ b/docs/GPTQ-models-(4-bit-mode).md @@ -0,0 +1,128 @@ +In 4-bit mode, models are loaded with just 25% of their regular VRAM usage. So LLaMA-7B fits into a 6GB GPU, and LLaMA-30B fits into a 24GB GPU. + +This is possible thanks to [@qwopqwop200](https://github.com/qwopqwop200/GPTQ-for-LLaMa)'s adaptation of the GPTQ algorithm for LLaMA: https://github.com/qwopqwop200/GPTQ-for-LLaMa + +GPTQ is a clever quantization algorithm that lightly reoptimizes the weights during quantization so that the accuracy loss is compensated relative to a round-to-nearest quantization. 
See the paper for more details: https://arxiv.org/abs/2210.17323 + +## Installation + +### Step 0: install nvcc + +``` +conda activate textgen +conda install -c conda-forge cudatoolkit-dev +``` + +The command above takes some 10 minutes to run and shows no progress bar or updates along the way. + +See this issue for more details: https://github.com/oobabooga/text-generation-webui/issues/416#issuecomment-1475078571 + +### Step 1: install GPTQ-for-LLaMa + +Clone the GPTQ-for-LLaMa repository into the `text-generation-webui/repositories` subfolder and install it: + +``` +mkdir repositories +cd repositories +git clone https://github.com/oobabooga/GPTQ-for-LLaMa.git -b cuda +cd GPTQ-for-LLaMa +python setup_cuda.py install +``` + +You are going to need to have a C++ compiler installed into your system for the last command. On Linux, `sudo apt install build-essential` or equivalent is enough. + +https://github.com/oobabooga/GPTQ-for-LLaMa corresponds to commit `a6f363e3f93b9fb5c26064b5ac7ed58d22e3f773` in the `cuda` branch of the original repository and is recommended by default for stability. Some models might require you to use the up-to-date CUDA or triton branches: + +``` +cd repositories +rm -r GPTQ-for-LLaMa +pip uninstall -y quant-cuda +git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git -b cuda +... +``` + +``` +cd repositories +rm -r GPTQ-for-LLaMa +pip uninstall -y quant-cuda +git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git -b triton +... +``` + + +https://github.com/qwopqwop200/GPTQ-for-LLaMa + +### Step 2: get the pre-converted weights + +* Converted without `group-size` (better for the 7b model): https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1483891617 +* Converted with `group-size` (better from 13b upwards): https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1483941105 + +Note: the tokenizer files in those torrents are not up to date. 
+ +### Step 3: Start the web UI: + +For the models converted without `group-size`: + +``` +python server.py --model llama-7b-4bit +``` + +For the models converted with `group-size`: + +``` +python server.py --model llama-13b-4bit-128g +``` + +The command-line flags `--wbits` and `--groupsize` are automatically detected based on the folder names, but you can also specify them manually like + +``` +python server.py --model llama-13b-4bit-128g --wbits 4 --groupsize 128 +``` + +## CPU offloading + +It is possible to offload part of the layers of the 4-bit model to the CPU with the `--pre_layer` flag. The higher the number after `--pre_layer`, the more layers will be allocated to the GPU. + +With this command, I can run llama-7b with 4GB VRAM: + +``` +python server.py --model llama-7b-4bit --pre_layer 20 +``` + +This is the performance: + +``` +Output generated in 123.79 seconds (1.61 tokens/s, 199 tokens) +``` + +## Using LoRAs in 4-bit mode + +At the moment, this feature is not officially supported by the relevant libraries, but a patch exists and is supported by this web UI: https://github.com/johnsmith0031/alpaca_lora_4bit + +In order to use it: + +1. Make sure that your requirements are up to date: + +``` +cd text-generation-webui +pip install -r requirements.txt --upgrade +``` + +2. Clone `johnsmith0031/alpaca_lora_4bit` into the repositories folder: + +``` +cd text-generation-webui/repositories +git clone https://github.com/johnsmith0031/alpaca_lora_4bit +``` + +3. Install https://github.com/sterlind/GPTQ-for-LLaMa with this command: + +``` +pip install git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit +``` + +4. Start the UI with the `--monkey-patch` flag: + +``` +python server.py --model llama-7b-4bit-128g --listen --lora tloen_alpaca-lora-7b --monkey-patch +``` diff --git a/docs/Home.md b/docs/Home.md new file mode 100644 index 00000000..8448d13c --- /dev/null +++ b/docs/Home.md @@ -0,0 +1 @@ +Welcome to the text-generation-webui wiki! 
diff --git a/docs/LLaMA-model.md b/docs/LLaMA-model.md new file mode 100644 index 00000000..95e0d35e --- /dev/null +++ b/docs/LLaMA-model.md @@ -0,0 +1,45 @@ +LLaMA is a Large Language Model developed by Meta AI. + +It was trained on more tokens than previous models. The result is that the smallest version with 7 billion parameters has similar performance to GPT-3 with 175 billion parameters. + +This guide will cover usage through the official `transformers` implementation. For 4-bit mode, head over to [GPTQ models (4 bit mode) +](https://github.com/oobabooga/text-generation-webui/wiki/GPTQ-models-(4-bit-mode)). + +## Getting the weights + +### Option 1: pre-converted weights + +* Torrent: https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1484235789 +* Direct download: https://huggingface.co/Neko-Institute-of-Science + +⚠️ The tokenizers for the sources above and also for many LLaMA fine-tunes available on Hugging Face may be outdated, so I recommend downloading the following universal LLaMA tokenizer: + +``` +python download-model.py oobabooga/llama-tokenizer +``` + +Once downloaded, it will be automatically applied to **every** `LlamaForCausalLM` model that you try to load. + +### Option 2: convert the weights yourself + +1. Install the `protobuf` library: + +``` +pip install protobuf +``` + +2. Use the script below to convert the model in `.pth` format that you, a fellow academic, downloaded using Meta's official link: + +### [convert_llama_weights_to_hf.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py) + +``` +python convert_llama_weights_to_hf.py --input_dir /path/to/LLaMA --model_size 7B --output_dir /tmp/outputs/llama-7b +``` + +3. Move the `llama-7b` folder inside your `text-generation-webui/models` folder. 
+ +## Starting the web UI + +```python +python server.py --model llama-7b +``` \ No newline at end of file diff --git a/docs/Low-VRAM-guide.md b/docs/Low-VRAM-guide.md new file mode 100644 index 00000000..d504d4e7 --- /dev/null +++ b/docs/Low-VRAM-guide.md @@ -0,0 +1,51 @@ +If your GPU is not large enough to fit a model, try these in the following order: + +### Load the model in 8-bit mode + +``` +python server.py --load-in-8bit +``` + +This reduces the memory usage by half with no noticeable loss in quality. Only newer GPUs support 8-bit mode. + +### Split the model across your GPU and CPU + +``` +python server.py --auto-devices +``` + +If you can load the model with this command but it runs out of memory when you try to generate text, try increasingly limiting the amount of memory allocated to the GPU until the error stops happening: + +``` +python server.py --auto-devices --gpu-memory 10 +python server.py --auto-devices --gpu-memory 9 +python server.py --auto-devices --gpu-memory 8 +... +``` + +where the number is in GiB. + +For finer control, you can also specify the unit in MiB explicitly: + +``` +python server.py --auto-devices --gpu-memory 8722MiB +python server.py --auto-devices --gpu-memory 4725MiB +python server.py --auto-devices --gpu-memory 3500MiB +... +``` + +Additionally, you can also set the `--no-cache` value to reduce the GPU usage while generating text at a performance cost. This may allow you to set a higher value for `--gpu-memory`, resulting in a net performance gain. + +### Send layers to a disk cache + +As a desperate last measure, you can split the model across your GPU, CPU, and disk: + +``` +python server.py --auto-devices --disk +``` + +With this, I am able to load a 30b model into my RTX 3090, but it takes 10 seconds to generate 1 word. + +### DeepSpeed (experimental) + +An experimental alternative to all of the above is to use DeepSpeed: [guide](https://github.com/oobabooga/text-generation-webui/wiki/DeepSpeed).
\ No newline at end of file diff --git a/docs/RWKV-model.md b/docs/RWKV-model.md new file mode 100644 index 00000000..27db3d10 --- /dev/null +++ b/docs/RWKV-model.md @@ -0,0 +1,54 @@ +> RWKV: RNN with Transformer-level LLM Performance +> +> It combines the best of RNN and transformer - great performance, fast inference, saves VRAM, fast training, "infinite" ctx_len, and free sentence embedding (using the final hidden state). + +https://github.com/BlinkDL/RWKV-LM + +https://github.com/BlinkDL/ChatRWKV + +## Using RWKV in the web UI + +#### 1. Download the model + +It is available in different sizes: + +* https://huggingface.co/BlinkDL/rwkv-4-pile-3b/ +* https://huggingface.co/BlinkDL/rwkv-4-pile-7b/ +* https://huggingface.co/BlinkDL/rwkv-4-pile-14b/ + +There are also older releases with smaller sizes like: + +* https://huggingface.co/BlinkDL/rwkv-4-pile-169m/resolve/main/RWKV-4-Pile-169M-20220807-8023.pth + +Download the chosen `.pth` and put it directly in the `models` folder. + +#### 2. Download the tokenizer + +[20B_tokenizer.json](https://raw.githubusercontent.com/BlinkDL/ChatRWKV/main/v2/20B_tokenizer.json) + +Also put it directly in the `models` folder. Make sure to not rename it. It should be called `20B_tokenizer.json`. + +#### 3. Launch the web UI + +No additional steps are required. Just launch it as you would with any other model. + +``` +python server.py --listen --no-stream --model RWKV-4-Pile-169M-20220807-8023.pth +``` + +## Setting a custom strategy + +It is possible to have very fine control over the offloading and precision for the model with the `--rwkv-strategy` flag. Possible values include: + +``` +"cpu fp32" # CPU mode +"cuda fp16" # GPU mode with float16 precision +"cuda fp16 *30 -> cpu fp32" # GPU+CPU offloading. The higher the number after *, the higher the GPU allocation. 
+"cuda fp16i8" # GPU mode with 8-bit precision +``` + +See the README for the PyPI package for more details: https://pypi.org/project/rwkv/ + +## Compiling the CUDA kernel + +You can compile the CUDA kernel for the model with `--rwkv-cuda-on`. This should improve the performance a lot but I haven't been able to get it to work yet. \ No newline at end of file diff --git a/docs/Spell-book.md b/docs/Spell-book.md new file mode 100644 index 00000000..1361a612 --- /dev/null +++ b/docs/Spell-book.md @@ -0,0 +1,111 @@ +You have now entered a hidden corner of the internet. + +A confusing yet intriguing realm of paradoxes and contradictions. + +A place where you will find out that what you thought you knew, you in fact didn't know, and what you didn't know was in front of you all along. + +![](https://i.pinimg.com/originals/6e/e2/7b/6ee27bad351d3aca470d80f1033ba9c6.jpg) + +*In other words, here I will document little-known facts about this web UI that I could not find another place for in the wiki.* + +#### You can train LoRAs in CPU mode + +Load the web UI with + +``` +python server.py --cpu +``` + +and start training the LoRA from the training tab as usual. + +#### 8-bit mode works with CPU offloading + +``` +python server.py --load-in-8bit --gpu-memory 4000MiB +``` + +#### `--pre_layer`, and not `--gpu-memory`, is the right way to do CPU offloading with 4-bit models + +``` +python server.py --wbits 4 --groupsize 128 --pre_layer 20 +``` + +#### Models can be loaded in 32-bit, 16-bit, 8-bit, and 4-bit modes + +``` +python server.py --cpu +python server.py +python server.py --load-in-8bit +python server.py --wbits 4 +``` + +#### The web UI works with any version of GPTQ-for-LLaMa + +Including the up to date triton and cuda branches. 
But you have to delete the `repositories/GPTQ-for-LLaMa` folder and reinstall the new one every time: + +``` +cd text-generation-webui/repositories +rm -r GPTQ-for-LLaMa +pip uninstall quant-cuda +git clone https://github.com/oobabooga/GPTQ-for-LLaMa -b cuda # or any other repository and branch +cd GPTQ-for-LLaMa +python setup_cuda.py install +``` + +#### Instruction-following templates are represented as chat characters + +https://github.com/oobabooga/text-generation-webui/tree/main/characters/instruction-following + +#### The right way to run Alpaca, Open Assistant, Vicuna, etc is Instruct mode, not normal chat mode + +Otherwise the prompt will not be formatted correctly. + +1. Start the web UI with + +``` +python server.py --chat +``` + +2. Click on the "instruct" option under "Chat modes" + +3. Select the correct template in the hidden dropdown menu that will become visible. + +#### Notebook mode is best mode + +Ascended individuals have realized that notebook mode is the superset of chat mode and can do chats with ultimate flexibility, including group chats, editing replies, starting a new bot reply in a given way, and impersonating. + +#### RWKV is a RNN + +Most models are transformers, but not RWKV, which is a RNN. It's a great model. + +#### `--gpu-memory` is not a hard limit on the GPU memory + +It is simply a parameter that is passed to the `accelerate` library while loading the model. More memory will be allocated during generation. That's why this parameter has to be set to less than your total GPU memory. + +#### Contrastive search perhaps the best preset + +But it uses a ton of VRAM. + +#### You can check the sha256sum of downloaded models with the download script + +``` +python download-model.py facebook/galactica-125m --check +``` + +#### The download script continues interrupted downloads by default + +It doesn't start over. 
+ +#### You can download models with multiple threads + +``` +python download-model.py facebook/galactica-125m --threads 8 +``` + +#### LoRAs work in 4-bit mode + +You need to follow these instructions + +https://github.com/oobabooga/text-generation-webui/wiki/GPTQ-models-(4-bit-mode)#using-loras-in-4-bit-mode + +and then start the web UI with the `--monkey-patch` flag. \ No newline at end of file diff --git a/docs/System-requirements.md b/docs/System-requirements.md new file mode 100644 index 00000000..3a88416d --- /dev/null +++ b/docs/System-requirements.md @@ -0,0 +1,42 @@ +These are the VRAM and RAM requirements (in MiB) to run some examples of models **in 16-bit (default) precision**: + +| model | VRAM (GPU) | RAM | +|:-----------------------|-------------:|--------:| +| arxiv_ai_gpt2 | 1512.37 | 5824.2 | +| blenderbot-1B-distill | 2441.75 | 4425.91 | +| opt-1.3b | 2509.61 | 4427.79 | +| gpt-neo-1.3b | 2605.27 | 5851.58 | +| opt-2.7b | 5058.05 | 4863.95 | +| gpt4chan_model_float16 | 11653.7 | 4437.71 | +| gpt-j-6B | 11653.7 | 5633.79 | +| galactica-6.7b | 12697.9 | 4429.89 | +| opt-6.7b | 12700 | 4368.66 | +| bloomz-7b1-p3 | 13483.1 | 4470.34 | + +#### GPU mode with 8-bit precision + +Allows you to load models that would not normally fit into your GPU. Enabled by default for 13b and 20b models in this web UI. + +| model | VRAM (GPU) | RAM | +|:---------------|-------------:|--------:| +| opt-13b | 12528.1 | 1152.39 | +| gpt-neox-20b | 20384 | 2291.7 | + +#### CPU mode (32-bit precision) + +A lot slower, but does not require a GPU. + +On my i5-12400F, 6B models take around 10-20 seconds to respond in chat mode, and around 5 minutes to generate a 200 tokens completion. 
+ +| model | RAM | +|:-----------------------|---------:| +| arxiv_ai_gpt2 | 4430.82 | +| gpt-neo-1.3b | 6089.31 | +| opt-1.3b | 8411.12 | +| blenderbot-1B-distill | 8508.16 | +| opt-2.7b | 14969.3 | +| bloomz-7b1-p3 | 21371.2 | +| gpt-j-6B | 24200.3 | +| gpt4chan_model | 24246.3 | +| galactica-6.7b | 26561.4 | +| opt-6.7b | 29596.6 | diff --git a/docs/Using-LoRAs.md b/docs/Using-LoRAs.md new file mode 100644 index 00000000..8049e96b --- /dev/null +++ b/docs/Using-LoRAs.md @@ -0,0 +1,88 @@ +Based on https://github.com/tloen/alpaca-lora + +## Instructions + +1. Download a LoRA, for instance: + +``` +python download-model.py tloen/alpaca-lora-7b +``` + +2. Load the LoRA. 16-bit, 8-bit, and CPU modes work: + +``` +python server.py --model llama-7b-hf --lora alpaca-lora-7b +python server.py --model llama-7b-hf --lora alpaca-lora-7b --load-in-8bit +python server.py --model llama-7b-hf --lora alpaca-lora-7b --cpu +``` + +* For using LoRAs in 4-bit mode, follow these special instructions: https://github.com/oobabooga/text-generation-webui/wiki/GPTQ-models-(4-bit-mode)#using-loras-in-4-bit-mode + +* Instead of using the `--lora` command-line flag, you can also select the LoRA in the "Parameters" tab of the interface. + +## Prompt +For the Alpaca LoRA in particular, the prompt must be formatted like this: + +``` +Below is an instruction that describes a task. Write a response that appropriately completes the request. +### Instruction: +Write a Python script that generates text using the transformers library. +### Response: +``` + +Sample output: + +``` +Below is an instruction that describes a task. Write a response that appropriately completes the request. +### Instruction: +Write a Python script that generates text using the transformers library. 
+### Response: + +import transformers +from transformers import AutoTokenizer, AutoModelForCausalLM +tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +model = AutoModelForCausalLM.from_pretrained("bert-base-uncased") +texts = ["Hello world", "How are you"] +for sentence in texts: +sentence = tokenizer(sentence) +print(f"Generated {len(sentence)} tokens from '{sentence}'") +output = model(sentences=sentence).predict() +print(f"Predicted {len(output)} tokens for '{sentence}':\n{output}") +``` + +## Training a LoRA + +The Training tab in the interface can be used to train a LoRA. The parameters are self-documenting and good defaults are included. + +This was contributed by [mcmonkey4eva](https://github.com/mcmonkey4eva) in PR [#570](https://github.com/oobabooga/text-generation-webui/pull/570). + + +#### Using the original alpaca-lora code + +Kept here for reference. The Training tab has much more features than this method. + +``` +conda activate textgen +git clone https://github.com/tloen/alpaca-lora +``` + +Edit those two lines in `alpaca-lora/finetune.py` to use your existing model folder instead of downloading everything from decapoda: + +``` +model = LlamaForCausalLM.from_pretrained( + "models/llama-7b", + load_in_8bit=True, + device_map="auto", +) +tokenizer = LlamaTokenizer.from_pretrained( + "models/llama-7b", add_eos_token=True +) +``` + +Run the script with: + +``` +python finetune.py +``` + +It just works. It runs at 22.32s/it, with 1170 iterations in total, so about 7 hours and a half for training a LoRA. RTX 3090, 18153MiB VRAM used, drawing maximum power (350W, room heater mode). \ No newline at end of file diff --git a/docs/WSL-installation-guide.md b/docs/WSL-installation-guide.md new file mode 100644 index 00000000..eb06123c --- /dev/null +++ b/docs/WSL-installation-guide.md @@ -0,0 +1,73 @@ +Guide created by [@jfryton](https://github.com/jfryton). Thank you jfryton. 
+ +----- + +Here's an easy-to-follow, step-by-step guide for installing Windows Subsystem for Linux (WSL) with Ubuntu on Windows 10/11: + +## Step 1: Enable WSL + +1. Press the Windows key + X and click on "Windows PowerShell (Admin)" or "Windows Terminal (Admin)" to open PowerShell or Terminal with administrator privileges. +2. In the PowerShell window, type the following command and press Enter: + +``` +wsl --install +``` + +If this command doesn't work, you can enable WSL with the following command for Windows 10: + +``` +wsl --set-default-version 1 +``` + +For Windows 11, you can use: + +``` +wsl --set-default-version 2 +``` + +You may be prompted to restart your computer. If so, save your work and restart. + +## Step 2: Install Ubuntu + +1. Open the Microsoft Store. +2. Search for "Ubuntu" in the search bar. +3. Choose the desired Ubuntu version (e.g., Ubuntu 20.04 LTS) and click "Get" or "Install" to download and install the Ubuntu app. +4. Once the installation is complete, click "Launch" or search for "Ubuntu" in the Start menu and open the app. + +## Step 3: Set up Ubuntu + +1. When you first launch the Ubuntu app, it will take a few minutes to set up. Be patient as it installs the necessary files and sets up your environment. +2. Once the setup is complete, you will be prompted to create a new UNIX username and password. Choose a username and password, and make sure to remember them, as you will need them for future administrative tasks within the Ubuntu environment. + +## Step 4: Update and upgrade packages + +1. After setting up your username and password, it's a good idea to update and upgrade your Ubuntu system. Run the following commands in the Ubuntu terminal: + +``` +sudo apt update +sudo apt upgrade +``` + +2. Enter your password when prompted. This will update the package list and upgrade any outdated packages. + +Congratulations! You have now installed WSL with Ubuntu on your Windows 10/11 system. 
You can use the Ubuntu terminal for various tasks, like running Linux commands, installing packages, or managing files. + +You can launch your WSL Ubuntu installation by selecting the Ubuntu app (like any other program installed on your computer) or typing 'ubuntu' into Powershell or Terminal. + +## Step 5: Proceed with Linux instructions + +1. You can now follow the Linux setup instructions. If you receive any error messages about a missing tool or package, just install them using apt: + +``` +sudo apt install [missing package] +``` + +If you face any issues or need to troubleshoot, you can always refer to the official Microsoft documentation for WSL: https://docs.microsoft.com/en-us/windows/wsl/ + +## Bonus: Port Forwarding + +By default, you won't be able to access the webui from another device on your local network. You will need to setup the appropriate port forwarding using the following command (using PowerShell or Terminal with administrator privileges). + +``` +netsh interface portproxy add v4tov4 listenaddress=0.0.0.0 listenport=7860 connectaddress=localhost connectport=7860 +``` \ No newline at end of file diff --git a/docs/Windows-installation-guide.md b/docs/Windows-installation-guide.md new file mode 100644 index 00000000..83b22efa --- /dev/null +++ b/docs/Windows-installation-guide.md @@ -0,0 +1,9 @@ +If you are having trouble following the installation instructions in the README, Reddit user [Technical_Leather949](https://www.reddit.com/user/Technical_Leather949/) has created a more detailed, step-by-step guide covering: + +* Windows installation +* 8-bit mode on Windows +* LLaMA +* LLaMA 4-bit + +The guide can be found here: https://www.reddit.com/r/LocalLLaMA/comments/11o6o3f/how_to_install_llama_8bit_and_4bit/ + diff --git a/docs/llama.cpp-models.md b/docs/llama.cpp-models.md new file mode 100644 index 00000000..7c1553a2 --- /dev/null +++ b/docs/llama.cpp-models.md @@ -0,0 +1,35 @@ +## Using llama.cpp in the web UI + +1. 
Re-install the requirements.txt: + +``` +pip install -r requirements.txt -U +``` + +2. Follow the instructions in the llama.cpp README to generate the `ggml-model-q4_0.bin` file: https://github.com/ggerganov/llama.cpp#usage + +3. Create a folder inside `models/` for your model and put `ggml-model-q4_0.bin` in it. For instance, `models/llamacpp-7b/ggml-model-q4_0.bin`. + +4. Start the web UI normally: + +``` +python server.py --model llamacpp-7b +``` + +* This procedure should work for any `ggml*.bin` file. Just put it in a folder, and use the name of this folder as the argument after `--model` or as the model loaded inside the interface. +* You can change the number of threads with `--threads N`. + +## Performance + +This was the performance of llama-7b int4 on my i5-12400F: + +> Output generated in 33.07 seconds (6.05 tokens/s, 200 tokens, context 17) + +## Limitations + +~* The parameter sliders in the interface (temperature, top_p, top_k, etc) are completely ignored. So only the default parameters in llama.cpp can be used.~ + +~* Only 512 tokens of context can be used.~ + +~Both of these should be improved soon when llamacpp-python receives an update.~ + From fe6e9ea9861550a3a2b995010a681b47f51457ad Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 02:40:08 -0300 Subject: [PATCH 15/33] Update README.md --- docs/README.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/docs/README.md b/docs/README.md index e7123041..169e6d43 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1 +1,18 @@ -wip +# text-generation-webui manual + +## Table of contents + +* [Custom-chat-characters](Custom-chat-characters.md) +* [DeepSpeed](DeepSpeed.md) +* [Extensions](Extensions.md) +* [FlexGen](FlexGen.md) +* [GPTQ-models-(4-bit-mode)](GPTQ-models-(4-bit-mode).md) +* [llama.cpp-models](llama.cpp-models.md) +* [LLaMA-model](LLaMA-model.md) +* [Low-VRAM-guide](Low-VRAM-guide.md) +* 
[RWKV-model](RWKV-model.md) +* [Spell-book](Spell-book.md) +* [System-requirements](System-requirements.md) +* [Using-LoRAs](Using-LoRAs.md) +* [Windows-installation-guide](Windows-installation-guide.md) +* [WSL-installation-guide](WSL-installation-guide.md) From b5e5b9aeaeaffa1d471e67ea3b8892cf29c4ca19 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 02:40:20 -0300 Subject: [PATCH 16/33] Delete Home.md --- docs/Home.md | 1 - 1 file changed, 1 deletion(-) delete mode 100644 docs/Home.md diff --git a/docs/Home.md b/docs/Home.md deleted file mode 100644 index 8448d13c..00000000 --- a/docs/Home.md +++ /dev/null @@ -1 +0,0 @@ -Welcome to the text-generation-webui wiki! From 038fa3eb3998874e3941522abb378c25f91e3407 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 02:46:07 -0300 Subject: [PATCH 17/33] Update README.md --- README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index eaadebda..f3a18a5e 100644 --- a/README.md +++ b/README.md @@ -16,23 +16,23 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. 
* Instruct mode compatible with Alpaca, Vicuna, Open Assistant, Dolly, Koala, and ChatGLM formats * Nice HTML output for GPT-4chan * Markdown output for [GALACTICA](https://github.com/paperswithcode/galai), including LaTeX rendering -* [Custom chat characters](https://github.com/oobabooga/text-generation-webui/wiki/Custom-chat-characters) +* [Custom chat characters](docs/Custom-chat-characters.md) * Advanced chat features (send images, get audio responses with TTS) * Very efficient text streaming * Parameter presets * 8-bit mode * Layers splitting across GPU(s), CPU, and disk * CPU mode -* [FlexGen](https://github.com/oobabooga/text-generation-webui/wiki/FlexGen) -* [DeepSpeed ZeRO-3](https://github.com/oobabooga/text-generation-webui/wiki/DeepSpeed) +* [FlexGen](docs/FlexGen.md) +* [DeepSpeed ZeRO-3](docs/DeepSpeed.md) * API [with](https://github.com/oobabooga/text-generation-webui/blob/main/api-example-stream.py) streaming and [without](https://github.com/oobabooga/text-generation-webui/blob/main/api-example.py) streaming -* [LLaMA model](https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model) -* [4-bit GPTQ mode](https://github.com/oobabooga/text-generation-webui/wiki/GPTQ-models-(4-bit-mode)) -* [llama.cpp](https://github.com/oobabooga/text-generation-webui/wiki/llama.cpp-models) -* [RWKV model](https://github.com/oobabooga/text-generation-webui/wiki/RWKV-model) -* [LoRA (loading and training)](https://github.com/oobabooga/text-generation-webui/wiki/Using-LoRAs) +* [LLaMA model](docs/LLaMA-model.md) +* [4-bit GPTQ mode](docs/GPTQ-models-(4-bit-mode).md) +* [llama.cpp](docs/llama.cpp-models.md) +* [RWKV model](docs/RWKV-model.md) +* [LoRA (loading and training)](docs/Using-LoRAs.md) * Softprompts -* [Extensions](https://github.com/oobabooga/text-generation-webui/wiki/Extensions) - see the [user extensions list](https://github.com/oobabooga/text-generation-webui-extensions) +* [Extensions](docs/Extensions.md) - see the [user extensions 
list](https://github.com/oobabooga/text-generation-webui-extensions) ## Installation @@ -52,7 +52,7 @@ Just download the zip above, extract it, and double click on "start". The web UI Recommended if you have some experience with the command-line. -On Windows, I additionally recommend carrying out the installation on WSL instead of the base system: [WSL installation guide](https://github.com/oobabooga/text-generation-webui/wiki/WSL-installation-guide). +On Windows, I additionally recommend carrying out the installation on WSL instead of the base system: [WSL installation guide](https://github.com/oobabooga/text-generation-webui/blob/main/docs/WSL-installation-guide.md). #### 0. Install Conda @@ -105,7 +105,7 @@ pip install -r requirements.txt ### Alternative: manual Windows installation -As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Windows installation guide](https://github.com/oobabooga/text-generation-webui/wiki/Windows-installation-guide). +As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Windows installation guide](https://github.com/oobabooga/text-generation-webui/blob/main/docs/Windows-installation-guide.md). ### Alternative: Docker @@ -269,7 +269,7 @@ Optionally, you can use the following command-line flags: | `--auto-launch` | Open the web UI in the default browser upon launch. | | `--gradio-auth-path GRADIO_AUTH_PATH` | Set the gradio authentication file path. The file should contain one or more user:password pairs in this format: "u1:p1,u2:p2,u3:p3" | -Out of memory errors? [Check the low VRAM guide](https://github.com/oobabooga/text-generation-webui/wiki/Low-VRAM-guide). +Out of memory errors? [Check the low VRAM guide](docs/Low-VRAM-guide.md). 
## Presets @@ -281,7 +281,7 @@ By default, 10 presets by NovelAI and KoboldAI are included. These were selected ## System requirements -Check the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/System-requirements) for some examples of VRAM and RAM usage in both GPU and CPU mode. +Check the [wiki](docs/System-requirements.md) for some examples of VRAM and RAM usage in both GPU and CPU mode. ## Contributing From f5c36cca4076b0b3762ebc7e8c749305dc66db2c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 02:49:54 -0300 Subject: [PATCH 18/33] Update LLaMA-model.md --- docs/LLaMA-model.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/LLaMA-model.md b/docs/LLaMA-model.md index 95e0d35e..299076b2 100644 --- a/docs/LLaMA-model.md +++ b/docs/LLaMA-model.md @@ -3,7 +3,7 @@ LLaMA is a Large Language Model developed by Meta AI. It was trained on more tokens than previous models. The result is that the smallest version with 7 billion parameters has similar performance to GPT-3 with 175 billion parameters. This guide will cover usage through the official `transformers` implementation. For 4-bit mode, head over to [GPTQ models (4 bit mode) -](https://github.com/oobabooga/text-generation-webui/wiki/GPTQ-models-(4-bit-mode)). +](GPTQ-models-(4-bit-mode).md). 
## Getting the weights @@ -42,4 +42,4 @@ python convert_llama_weights_to_hf.py --input_dir /path/to/LLaMA --model_size 7B ```python python server.py --model llama-7b -``` \ No newline at end of file +``` From 6d4f131d0ae62d9e3bf342c7edaeab307c3a9f52 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 02:50:35 -0300 Subject: [PATCH 19/33] Update Low-VRAM-guide.md --- docs/Low-VRAM-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Low-VRAM-guide.md b/docs/Low-VRAM-guide.md index d504d4e7..1dc86f9c 100644 --- a/docs/Low-VRAM-guide.md +++ b/docs/Low-VRAM-guide.md @@ -48,4 +48,4 @@ With this, I am able to load a 30b model into my RTX 3090, but it takes 10 secon ### DeepSpeed (experimental) -An experimental alternative to all of the above is to use DeepSpeed: [guide](https://github.com/oobabooga/text-generation-webui/wiki/DeepSpeed). \ No newline at end of file +An experimental alternative to all of the above is to use DeepSpeed: [guide](DeepSpeed.md). From 9508f207ba9f30be67a3832843490ef1c87db2fe Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 02:53:01 -0300 Subject: [PATCH 20/33] Update Using-LoRAs.md --- docs/Using-LoRAs.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/Using-LoRAs.md b/docs/Using-LoRAs.md index 8049e96b..159345ef 100644 --- a/docs/Using-LoRAs.md +++ b/docs/Using-LoRAs.md @@ -16,7 +16,7 @@ python server.py --model llama-7b-hf --lora alpaca-lora-7b --load-in-8bit python server.py --model llama-7b-hf --lora alpaca-lora-7b --cpu ``` -* For using LoRAs in 4-bit mode, follow these special instructions: https://github.com/oobabooga/text-generation-webui/wiki/GPTQ-models-(4-bit-mode)#using-loras-in-4-bit-mode +* For using LoRAs in 4-bit mode, follow [these special instructions](GPTQ-models-(4-bit-mode).md#using-loras-in-4-bit-mode). 
* Instead of using the `--lora` command-line flag, you can also select the LoRA in the "Parameters" tab of the interface. @@ -85,4 +85,4 @@ Run the script with: python finetune.py ``` -It just works. It runs at 22.32s/it, with 1170 iterations in total, so about 7 hours and a half for training a LoRA. RTX 3090, 18153MiB VRAM used, drawing maximum power (350W, room heater mode). \ No newline at end of file +It just works. It runs at 22.32s/it, with 1170 iterations in total, so about 7 hours and a half for training a LoRA. RTX 3090, 18153MiB VRAM used, drawing maximum power (350W, room heater mode). From 4d9ae44efd52e7d808a50ea2a01c937ebd03e148 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 02:53:52 -0300 Subject: [PATCH 21/33] Update Spell-book.md --- docs/Spell-book.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/Spell-book.md b/docs/Spell-book.md index 1361a612..9b7c76c9 100644 --- a/docs/Spell-book.md +++ b/docs/Spell-book.md @@ -104,8 +104,4 @@ python download-model.py facebook/galactica-125m --threads 8 #### LoRAs work in 4-bit mode -You need to follow these instructions - -https://github.com/oobabooga/text-generation-webui/wiki/GPTQ-models-(4-bit-mode)#using-loras-in-4-bit-mode - -and then start the web UI with the `--monkey-patch` flag. \ No newline at end of file +You need to follow [these instructions](GPTQ-models-(4-bit-mode).md#using-loras-in-4-bit-mode) and then start the web UI with the `--monkey-patch` flag. 
From 408e172ad9fa81102c50d4eabdf826f31a8976fc Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 03:03:05 -0300 Subject: [PATCH 22/33] Rename docker/README.md to docs/Docker.md --- docker/README.md => docs/Docker.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docker/README.md => docs/Docker.md (100%) diff --git a/docker/README.md b/docs/Docker.md similarity index 100% rename from docker/README.md rename to docs/Docker.md From ef40b4e862528bb2002e8e65c6f45e5068eff980 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 03:03:39 -0300 Subject: [PATCH 23/33] Update README.md --- docs/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/README.md b/docs/README.md index 169e6d43..f43efa68 100644 --- a/docs/README.md +++ b/docs/README.md @@ -3,6 +3,7 @@ ## Table of contents * [Custom-chat-characters](Custom-chat-characters.md) +* [Docker Compose](Docker.md) * [DeepSpeed](DeepSpeed.md) * [Extensions](Extensions.md) * [FlexGen](FlexGen.md) From fe02281477d58bc4212c2aa5530c887b52eff926 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 03:05:00 -0300 Subject: [PATCH 24/33] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f3a18a5e..bb54d586 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,7 @@ cp docker/.env.example .env docker compose up --build ``` -You need to have docker compose v2.17 or higher installed in your system. To see how to install docker compose itself, see the guide in https://github.com/oobabooga/text-generation-webui/tree/main/docker. +You need to have docker compose v2.17 or higher installed in your system. To see how to install docker compose itself, see the guide [here](https://github.com/oobabooga/text-generation-webui/blob/main/docs/Docker.md). 
Contributed by [@loeken](https://github.com/loeken) in [#633](https://github.com/oobabooga/text-generation-webui/pull/633) From e03b87346022f2fdbbb6687048d98539296e7edc Mon Sep 17 00:00:00 2001 From: InconsolableCellist <23345188+InconsolableCellist@users.noreply.github.com> Date: Sat, 22 Apr 2023 00:35:36 -0600 Subject: [PATCH 25/33] Updating Using-LoRAs.md doc to clarify resuming training (#1474) --- docs/Using-LoRAs.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/Using-LoRAs.md b/docs/Using-LoRAs.md index 159345ef..de271e3d 100644 --- a/docs/Using-LoRAs.md +++ b/docs/Using-LoRAs.md @@ -54,8 +54,9 @@ print(f"Predicted {len(output)} tokens for '{sentence}':\n{output}") The Training tab in the interface can be used to train a LoRA. The parameters are self-documenting and good defaults are included. -This was contributed by [mcmonkey4eva](https://github.com/mcmonkey4eva) in PR [#570](https://github.com/oobabooga/text-generation-webui/pull/570). +You can interrupt and resume LoRA training in this tab. If the name and rank are the same, training will resume using the `adapter_model.bin` in your LoRA folder. You can resume from a past checkpoint by replacing this file using the contents of one of the checkpoint folders. Note that the learning rate and steps will be reset, and you may want to set the learning rate to the last reported rate in the console output. +LoRA training was contributed by [mcmonkey4eva](https://github.com/mcmonkey4eva) in PR [#570](https://github.com/oobabooga/text-generation-webui/pull/570). 
#### Using the original alpaca-lora code From 7438f4f6ba28f36d59da5132a5bcf5edb10742d4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 12:27:30 -0300 Subject: [PATCH 26/33] Change GPTQ triton default settings --- README.md | 6 +++--- modules/GPTQ_loader.py | 8 ++++---- modules/shared.py | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index bb54d586..681180ba 100644 --- a/README.md +++ b/README.md @@ -230,9 +230,9 @@ Optionally, you can use the following command-line flags: | `--groupsize GROUPSIZE` | Group size. | | `--pre_layer PRE_LAYER` | The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. | | `--monkey-patch` | Apply the monkey patch for using LoRAs with quantized models. -| `--no-quant_attn` | (triton) Disable quant attention. If you encounter incoherent results try disabling this. -| `--no-warmup_autotune` | (triton) Disable warmup autotune. -| `--no-fused_mlp` | (triton) Disable fused mlp. If you encounter "Unexpected mma -> mma layout conversion" try disabling this. +| `--quant_attn` | (triton) Enable quant attention. +| `--warmup_autotune` | (triton) Enable warmup autotune. +| `--fused_mlp` | (triton) Enable fused mlp. 
#### FlexGen diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index 3379d27a..a42dbcf3 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -79,14 +79,14 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc model.load_state_dict(torch.load(checkpoint), strict=False) if is_triton: - if not shared.args.no_quant_attn: + if shared.args.quant_attn: quant.make_quant_attn(model) - if eval and not shared.args.no_fused_mlp: + if eval and shared.args.fused_mlp: quant.make_fused_mlp(model) - if not shared.args.no_warmup_autotune: + if shared.args.warmup_autotune: quant.autotune_warmup_linear(model, transpose=not eval) - if eval and not shared.args.no_fused_mlp: + if eval and shared.args.fused_mlp: quant.autotune_warmup_fused(model) model.seqlen = 2048 diff --git a/modules/shared.py b/modules/shared.py index a08f134f..41c068db 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -124,9 +124,9 @@ parser.add_argument('--model_type', type=str, help='Model type of pre-quantized parser.add_argument('--groupsize', type=int, default=-1, help='Group size.') parser.add_argument('--pre_layer', type=int, default=0, help='The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models.') parser.add_argument('--monkey-patch', action='store_true', help='Apply the monkey patch for using LoRAs with quantized models.') -parser.add_argument('--no-quant_attn', action='store_true', help='(triton) Disable quant attention. If you encounter incoherent results try disabling this.') -parser.add_argument('--no-warmup_autotune', action='store_true', help='(triton) Disable warmup autotune.') -parser.add_argument('--no-fused_mlp', action='store_true', help='(triton) Disable fused mlp. 
If you encounter "Unexpected mma -> mma layout conversion" try disabling this.') +parser.add_argument('--quant_attn', action='store_true', help='(triton) Enable quant attention.') +parser.add_argument('--warmup_autotune', action='store_true', help='(triton) Enable warmup autotune.') +parser.add_argument('--fused_mlp', action='store_true', help='(triton) Enable fused mlp.') # FlexGen parser.add_argument('--flexgen', action='store_true', help='Enable the use of FlexGen offloading.') From 2c6d43e60f6c31aec3963047468d6b51d593bf7f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 12:48:20 -0300 Subject: [PATCH 27/33] Update GPTQ-models-(4-bit-mode).md --- docs/GPTQ-models-(4-bit-mode).md | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md index 9ed7cc37..12f02a0c 100644 --- a/docs/GPTQ-models-(4-bit-mode).md +++ b/docs/GPTQ-models-(4-bit-mode).md @@ -4,6 +4,18 @@ This is possible thanks to [@qwopqwop200](https://github.com/qwopqwop200/GPTQ-fo GPTQ is a clever quantization algorithm that lightly reoptimizes the weights during quantization so that the accuracy loss is compensated relative to a round-to-nearest quantization. See the paper for more details: https://arxiv.org/abs/2210.17323 +## GPTQ-for-LLaMa branches + +Different branches of GPTQ-for-LLaMa are available: + +| Branch | Comment | +|----|----| +| [Old CUDA branch (recommended)](https://github.com/oobabooga/GPTQ-for-LLaMa/) | The fastest branch, works on Windows and Linux. | +| [Up-to-date triton branch](https://github.com/qwopqwop200/GPTQ-for-LLaMa) | Slightly more precise than the old CUDA branch, 2x slower for small context size, only works on Linux. | +| [Up-to-date CUDA branch](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda) | As precise as the up-to-date triton branch, 10x slower than the old cuda branch for small context size. 
| + +Overall, I recommend using the old CUDA branch. It is included by default in the one-click-installer for this web UI. + ## Installation ### Step 0: install nvcc @@ -19,7 +31,7 @@ See this issue for more details: https://github.com/oobabooga/text-generation-we ### Step 1: install GPTQ-for-LLaMa -Clone the GPTQ-for-LLaMa repository into the `text-generation-webui/repositories` subfolder and install it: +* Clone the GPTQ-for-LLaMa repository into the `text-generation-webui/repositories` subfolder and install it: ``` mkdir repositories @@ -31,7 +43,7 @@ python setup_cuda.py install You are going to need to have a C++ compiler installed into your system for the last command. On Linux, `sudo apt install build-essential` or equivalent is enough. -https://github.com/oobabooga/GPTQ-for-LLaMa corresponds to commit `a6f363e3f93b9fb5c26064b5ac7ed58d22e3f773` in the `cuda` branch of the original repository and is recommended by default for stability. Some models might require you to use the up-to-date CUDA or triton branches: +If you want to use the up-to-date CUDA or triton branches instead of the old CUDA branch, use these commands: ``` cd repositories @@ -57,7 +69,7 @@ https://github.com/qwopqwop200/GPTQ-for-LLaMa * Converted without `group-size` (better for the 7b model): https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1483891617 * Converted with `group-size` (better from 13b upwards): https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1483941105 -Note: the tokenizer files in those torrents are not up to date. +⚠️ The tokenizer files in the sources above may be outdated. Make sure to obtain the universal LLaMA tokenizer as described [here](https://github.com/oobabooga/text-generation-webui/blob/main/docs/LLaMA-model.md#option-1-pre-converted-weights). 
### Step 3: Start the web UI: From 06b6ff6c2e1150870156a1f3c88ffcd0b21ca668 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 12:49:00 -0300 Subject: [PATCH 28/33] Update GPTQ-models-(4-bit-mode).md --- docs/GPTQ-models-(4-bit-mode).md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md index 12f02a0c..f8429e6a 100644 --- a/docs/GPTQ-models-(4-bit-mode).md +++ b/docs/GPTQ-models-(4-bit-mode).md @@ -31,7 +31,7 @@ See this issue for more details: https://github.com/oobabooga/text-generation-we ### Step 1: install GPTQ-for-LLaMa -* Clone the GPTQ-for-LLaMa repository into the `text-generation-webui/repositories` subfolder and install it: +Clone the GPTQ-for-LLaMa repository into the `text-generation-webui/repositories` subfolder and install it: ``` mkdir repositories From fcb594b90e86d7d22db2de7a636f7a3c1491226e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 14:56:48 -0300 Subject: [PATCH 29/33] Don't require llama.cpp models to be placed in subfolders --- docs/llama.cpp-models.md | 28 +++++---------------------- modules/models.py | 39 +++++++++++++++++++++++++++++--------- modules/shared.py | 3 +-- modules/text_generation.py | 10 +++++----- 4 files changed, 41 insertions(+), 39 deletions(-) diff --git a/docs/llama.cpp-models.md b/docs/llama.cpp-models.md index 7c1553a2..57fbf613 100644 --- a/docs/llama.cpp-models.md +++ b/docs/llama.cpp-models.md @@ -1,23 +1,12 @@ ## Using llama.cpp in the web UI -1. Re-install the requirements.txt: +#### Pre-converted models -``` -pip install -r requirements.txt -U -``` +Simply place the model in the `models` folder, making sure that its name contains `ggml` somewhere and ends in `.bin`. -2. 
Follow the instructions in the llama.cpp README to generate the `ggml-model-q4_0.bin` file: https://github.com/ggerganov/llama.cpp#usage +#### Convert LLaMA yourself -3. Create a folder inside `models/` for your model and put `ggml-model-q4_0.bin` in it. For instance, `models/llamacpp-7b/ggml-model-q4_0.bin`. - -4. Start the web UI normally: - -``` -python server.py --model llamacpp-7b -``` - -* This procedure should work for any `ggml*.bin` file. Just put it in a folder, and use the name of this folder as the argument after `--model` or as the model loaded inside the interface. -* You can change the number of threads with `--threads N`. +Follow the instructions in the llama.cpp README to generate the `ggml-model-q4_0.bin` file: https://github.com/ggerganov/llama.cpp#usage ## Performance @@ -25,11 +14,4 @@ This was the performance of llama-7b int4 on my i5-12400F: > Output generated in 33.07 seconds (6.05 tokens/s, 200 tokens, context 17) -## Limitations - -~* The parameter sliders in the interface (temperature, top_p, top_k, etc) are completely ignored. So only the default parameters in llama.cpp can be used.~ - -~* Only 512 tokens of context can be used.~ - -~Both of these should be improved soon when llamacpp-python receives an update.~ - +You can change the number of threads with `--threads N`. 
diff --git a/modules/models.py b/modules/models.py index 800d0be2..ca014d79 100644 --- a/modules/models.py +++ b/modules/models.py @@ -38,13 +38,30 @@ if shared.args.deepspeed: dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration +def find_model_type(model_name): + model_name = model_name.lower() + if 'rwkv-' in model_name.lower(): + return 'rwkv' + elif len(list(Path(f'{shared.args.model_dir}/{model_name}').glob('*ggml*.bin'))) > 0: + return 'llamacpp' + elif re.match('.*ggml.*\.bin', model_name): + return 'llamacpp' + elif 'chatglm' in model_name: + return 'chatglm' + elif 'galactica' in model_name: + return 'galactica' + elif any((k in model_name for k in ['gpt4chan', 'gpt-4chan'])): + return 'gpt4chan' + else: + return 'HF_generic' + + def load_model(model_name): print(f"Loading {model_name}...") t0 = time.time() - shared.is_RWKV = 'rwkv-' in model_name.lower() - shared.is_llamacpp = len(list(Path(f'{shared.args.model_dir}/{model_name}').glob('ggml*.bin'))) > 0 - if 'chatglm' in model_name.lower(): + shared.model_type = find_model_type(model_name) + if shared.model_type == 'chatglm': LoaderClass = AutoModel trust_remote_code = shared.args.trust_remote_code else: @@ -52,7 +69,7 @@ def load_model(model_name): trust_remote_code = False # Load the model in simple 16-bit mode by default - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV, shared.is_llamacpp]): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.model_type in ['rwkv', 'llamacpp']]): model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}"), 
low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16, trust_remote_code=trust_remote_code) if torch.has_mps: device = torch.device('mps') @@ -91,7 +108,7 @@ def load_model(model_name): print(f"DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}") # RMKV model (not on HuggingFace) - elif shared.is_RWKV: + elif shared.model_type == 'rwkv': from modules.RWKV import RWKVModel, RWKVTokenizer model = RWKVModel.from_pretrained(Path(f'{shared.args.model_dir}/{model_name}'), dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device="cpu" if shared.args.cpu else "cuda") @@ -100,12 +117,16 @@ def load_model(model_name): return model, tokenizer # llamacpp model - elif shared.is_llamacpp: + elif shared.model_type == 'llamacpp': from modules.llamacpp_model_alternative import LlamaCppModel - model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('ggml*.bin'))[0] - print(f"llama.cpp weights detected: {model_file}\n") + path = Path(f'{shared.args.model_dir}/{model_name}') + if path.is_file(): + model_file = path + else: + model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('*ggml*.bin'))[0] + print(f"llama.cpp weights detected: {model_file}\n") model, tokenizer = LlamaCppModel.from_pretrained(model_file) return model, tokenizer @@ -190,7 +211,7 @@ def load_model(model_name): llama_attn_hijack.hijack_llama_attention() # Loading the tokenizer - if any((k in model_name.lower() for k in ['gpt4chan', 'gpt-4chan'])) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists(): + if shared.model_type == 'gpt4chan' and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists(): tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/")) elif type(model) is transformers.LlamaForCausalLM: tokenizer = None diff --git a/modules/shared.py b/modules/shared.py index 41c068db..1517526a 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -6,11 +6,10 @@ import yaml model = None 
tokenizer = None model_name = "None" +model_type = None lora_names = [] soft_prompt_tensor = None soft_prompt = False -is_RWKV = False -is_llamacpp = False # Chat variables history = {'internal': [], 'visible': []} diff --git a/modules/text_generation.py b/modules/text_generation.py index 370130ed..e1e169a0 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -24,7 +24,7 @@ def get_max_prompt_length(state): def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None): - if any((shared.is_RWKV, shared.is_llamacpp)): + if shared.model_type in ['rwkv', 'llamacpp']: input_ids = shared.tokenizer.encode(str(prompt)) input_ids = np.array(input_ids).reshape(1, len(input_ids)) return input_ids @@ -44,7 +44,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if truncation_length is not None: input_ids = input_ids[:, -truncation_length:] - if any((shared.is_RWKV, shared.is_llamacpp, shared.args.cpu)): + if shared.model_type in ['rwkv', 'llamacpp'] or shared.args.cpu: return input_ids elif shared.args.flexgen: return input_ids.numpy() @@ -97,10 +97,10 @@ def fix_galactica(s): def formatted_outputs(reply, model_name): if not shared.is_chat(): - if 'galactica' in model_name.lower(): + if shared.model_type == 'galactica': reply = fix_galactica(reply) return reply, reply, generate_basic_html(reply) - elif any((k in shared.model_name.lower() for k in ['gpt4chan', 'gpt-4chan'])): + elif shared.model_type == 'gpt4chan': reply = fix_gpt4chan(reply) return reply, 'Only applicable for GALACTICA models.', generate_4chan_html(reply) else: @@ -142,7 +142,7 @@ def generate_reply(question, state, eos_token=None, stopping_strings=[]): # These models are not part of Hugging Face, so we handle them # separately and terminate the function call earlier - if any((shared.is_RWKV, shared.is_llamacpp)): + if shared.model_type in ['rwkv', 'llamacpp']: if shared.args.verbose: 
print(f'\n\n{question}\n--------------------\n') From 47666c4d00275a09962d6177fed3a6a3a2be11fe Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 15:12:14 -0300 Subject: [PATCH 30/33] Update GPTQ-models-(4-bit-mode).md --- docs/GPTQ-models-(4-bit-mode).md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md index f8429e6a..679cabee 100644 --- a/docs/GPTQ-models-(4-bit-mode).md +++ b/docs/GPTQ-models-(4-bit-mode).md @@ -11,7 +11,7 @@ Different branches of GPTQ-for-LLaMa are available: | Branch | Comment | |----|----| | [Old CUDA branch (recommended)](https://github.com/oobabooga/GPTQ-for-LLaMa/) | The fastest branch, works on Windows and Linux. | -| [Up-to-date triton branch](https://github.com/qwopqwop200/GPTQ-for-LLaMa) | Slightly more precise than the old CUDA branch, 2x slower for small context size, only works on Linux. | +| [Up-to-date triton branch](https://github.com/qwopqwop200/GPTQ-for-LLaMa) | Slightly more precise than the old CUDA branch from 13b upwards, significantly more precise for 7b. 2x slower for small context size and only works on Linux. | | [Up-to-date CUDA branch](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda) | As precise as the up-to-date triton branch, 10x slower than the old cuda branch for small context size. | Overall, I recommend using the old CUDA branch. It is included by default in the one-click-installer for this web UI. 
From c0b5c09860923c58660ee9018fa1a5971ff60b76 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 15:15:31 -0300 Subject: [PATCH 31/33] Minor change --- modules/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index ca014d79..469cbaf7 100644 --- a/modules/models.py +++ b/modules/models.py @@ -40,7 +40,7 @@ if shared.args.deepspeed: def find_model_type(model_name): model_name = model_name.lower() - if 'rwkv-' in model_name.lower(): + if 'rwkv-' in model_name: return 'rwkv' elif len(list(Path(f'{shared.args.model_dir}/{model_name}').glob('*ggml*.bin'))) > 0: return 'llamacpp' From b992c9236a3f0882989b0741775b9047f79e7475 Mon Sep 17 00:00:00 2001 From: AICatgirls <130926942+AICatgirls@users.noreply.github.com> Date: Sat, 22 Apr 2023 12:06:43 -0700 Subject: [PATCH 32/33] Prevent API extension responses from getting cut off with --chat enabled (#1467) --- extensions/api/script.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/extensions/api/script.py b/extensions/api/script.py index 5e429130..64e58aab 100644 --- a/extensions/api/script.py +++ b/extensions/api/script.py @@ -73,10 +73,11 @@ class Handler(BaseHTTPRequestHandler): response = json.dumps({ 'results': [{ - 'text': answer[len(prompt):] + 'text': answer if shared.args.is_chat() else answer[len(prompt):] }] }) self.wfile.write(response.encode('utf-8')) + elif self.path == '/api/v1/token-count': # Not compatible with KoboldAI api self.send_response(200) @@ -90,6 +91,7 @@ class Handler(BaseHTTPRequestHandler): }] }) self.wfile.write(response.encode('utf-8')) + else: self.send_error(404) From 7ff645899e4610b16574bdd22a4d154c93d5b830 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 22 Apr 2023 17:33:36 -0300 Subject: [PATCH 33/33] Fix bug in api extension --- extensions/api/script.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/extensions/api/script.py b/extensions/api/script.py index 64e58aab..e4c3a556 100644 --- a/extensions/api/script.py +++ b/extensions/api/script.py @@ -73,7 +73,7 @@ class Handler(BaseHTTPRequestHandler): response = json.dumps({ 'results': [{ - 'text': answer if shared.args.is_chat() else answer[len(prompt):] + 'text': answer if shared.is_chat() else answer[len(prompt):] }] }) self.wfile.write(response.encode('utf-8'))