From bd27353a08df9896ef6b560cde7e3dcb33b7d67d Mon Sep 17 00:00:00 2001 From: Maya <48323879+mayaeary@users.noreply.github.com> Date: Sun, 19 Mar 2023 12:51:27 +0000 Subject: [PATCH 01/42] Fix duplicating server on ui reload --- extensions/api/script.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/extensions/api/script.py b/extensions/api/script.py index 53e47f3f..1c57c72a 100644 --- a/extensions/api/script.py +++ b/extensions/api/script.py @@ -8,6 +8,8 @@ params = { 'port': 5000, } +server = None + class Handler(BaseHTTPRequestHandler): def do_GET(self): if self.path == '/api/v1/model': @@ -73,6 +75,7 @@ class Handler(BaseHTTPRequestHandler): def run_server(): + global server server_addr = ('0.0.0.0' if shared.args.listen else '127.0.0.1', params['port']) server = ThreadingHTTPServer(server_addr, Handler) if shared.args.share: @@ -87,4 +90,5 @@ def run_server(): server.serve_forever() def ui(): - Thread(target=run_server, daemon=True).start() \ No newline at end of file + if server is None: + Thread(target=run_server, daemon=True).start() \ No newline at end of file From 099d7a844b67caa4d33ad272a03d2ab9e18c2a0b Mon Sep 17 00:00:00 2001 From: Maya <48323879+mayaeary@users.noreply.github.com> Date: Sun, 19 Mar 2023 13:22:24 +0000 Subject: [PATCH 02/42] Add setup method to extensions --- extensions/api/script.py | 8 ++------ modules/extensions.py | 8 ++++++++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/extensions/api/script.py b/extensions/api/script.py index 1c57c72a..bbd8551a 100644 --- a/extensions/api/script.py +++ b/extensions/api/script.py @@ -8,8 +8,6 @@ params = { 'port': 5000, } -server = None - class Handler(BaseHTTPRequestHandler): def do_GET(self): if self.path == '/api/v1/model': @@ -75,7 +73,6 @@ class Handler(BaseHTTPRequestHandler): def run_server(): - global server server_addr = ('0.0.0.0' if shared.args.listen else '127.0.0.1', params['port']) server = ThreadingHTTPServer(server_addr, Handler) if shared.args.share: @@ -89,6 +86,5 @@ def run_server(): print(f'Starting KoboldAI compatible api at http://{server_addr[0]}:{server_addr[1]}/api') server.serve_forever() -def ui(): - if server is None: - Thread(target=run_server, daemon=True).start() \ No newline at end of file +def setup(): + Thread(target=run_server, daemon=True).start() \ No newline at end of file diff --git a/modules/extensions.py b/modules/extensions.py index 836fbc60..9f11c882 100644 --- a/modules/extensions.py +++ b/modules/extensions.py @@ -5,6 +5,7 @@ import modules.shared as shared state = {} available_extensions = [] +setup_called = False def load_extensions(): global state @@ -44,6 +45,13 @@ def create_extensions_block(): if _id in shared.settings: extension.params[param] = shared.settings[_id] + # Running setup function + if not setup_called: + for extension, name in iterator(): + if hasattr(extension, "setup"): + extension.setup() + setup_called = True + # Creating the extension ui elements if len(state) > 0: with gr.Box(elem_id="extensions"): From 81c9d130f2a02410ce6da17806564677c09ab844 Mon Sep 17 00:00:00 2001 From: Maya <48323879+mayaeary@users.noreply.github.com> Date: Sun, 19 Mar 2023 13:25:49 +0000 Subject: [PATCH 03/42] Fix global --- modules/extensions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/extensions.py b/modules/extensions.py index 9f11c882..defc0d66 100644 --- a/modules/extensions.py +++ b/modules/extensions.py @@ -37,6 +37,7 @@ def apply_extensions(text, typ): return text def create_extensions_block(): + global setup_called # Updating the default values for extension, name in iterator(): if hasattr(extension, 'params'): From acdbd6b708b6c6e91b601a28cdfdb8d86a3cc395 Mon Sep 17 00:00:00 2001 From: Maya <48323879+mayaeary@users.noreply.github.com> Date: Sun, 19 Mar 2023 13:31:21 +0000 Subject: [PATCH 04/42] Check if app should display extensions ui --- modules/extensions.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/extensions.py b/modules/extensions.py index defc0d66..b363bc39 100644 --- a/modules/extensions.py +++ b/modules/extensions.py @@ -46,15 +46,18 @@ def create_extensions_block(): if _id in shared.settings: extension.params[param] = shared.settings[_id] + should_display_ui = False # Running setup function if not setup_called: for extension, name in iterator(): if hasattr(extension, "setup"): extension.setup() + if hasattr(extension, "ui"): + should_display_ui = True setup_called = True # Creating the extension ui elements - if len(state) > 0: + if should_display_ui: with gr.Box(elem_id="extensions"): gr.Markdown("Extensions") for extension, name in iterator(): From ca47e016b4f1651824ad5631a4dcf05ed0f5de4c Mon Sep 17 00:00:00 2001 From: Vladimir Belitskiy Date: Mon, 20 Mar 2023 12:55:57 -0400 Subject: [PATCH 05/42] Do not display empty user messages in chat mode. There doesn't seem to be much value to them - they just take up space while also making it seem like there's still some sort of pseudo-dialogue going on, instead of a monologue by the bot. --- modules/html_generator.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/modules/html_generator.py b/modules/html_generator.py index 940d5486..f8cff6d8 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -142,22 +142,23 @@ def generate_chat_html(history, name1, name2, character): """ - if not (i == len(history)-1 and len(row[0]) == 0): - output += f""" -
-
- {img_me} -
-
-
- {name1} -
-
- {row[0]} -
-
+ if not row[0]: # don't display empty user messages + continue + output += f""" +
+
+ {img_me} +
+
+
+ {name1}
- """ +
+ {row[0]} +
+
+
+ """ output += "
" return output From e96687b1d619bc3bb149cca0011adba01274963a Mon Sep 17 00:00:00 2001 From: Vladimir Belitskiy Date: Mon, 20 Mar 2023 14:16:48 -0400 Subject: [PATCH 06/42] Do not send empty user input as part of the prompt. However, if extensions modify the empty prompt to be non-empty, it'l still work as before. --- modules/chat.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 36265990..c1e55ac4 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -33,12 +33,14 @@ def generate_chat_prompt(user_input, max_new_tokens, name1, name2, context, chat i = len(shared.history['internal'])-1 while i >= 0 and len(encode(''.join(rows), max_new_tokens)[0]) < max_length: rows.insert(1, f"{name2}: {shared.history['internal'][i][1].strip()}\n") - if not (shared.history['internal'][i][0] == '<|BEGIN-VISIBLE-CHAT|>'): + prev_user_input = shared.history['internal'][i][0] + if prev_user_input and not shared.history['internal'][i][0] == '<|BEGIN-VISIBLE-CHAT|>': rows.insert(1, f"{name1}: {shared.history['internal'][i][0].strip()}\n") i -= 1 if not impersonate: - rows.append(f"{name1}: {user_input}\n") + if user_input: + rows.append(f"{name1}: {user_input}\n") rows.append(apply_extensions(f"{name2}:", "bot_prefix")) limit = 3 else: From eac27f4f556b2e4fd149e65e2395fbc9ce2ea3c7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 23 Mar 2023 00:55:33 -0300 Subject: [PATCH 07/42] Make LoRAs work in 16-bit mode --- modules/LoRA.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/modules/LoRA.py b/modules/LoRA.py index 6915e157..20850338 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -13,10 +13,15 @@ def add_lora_to_model(lora_name): print("Reloading the model to remove the LoRA...") shared.model, shared.tokenizer = load_model(shared.model_name) else: - # Why doesn't this work in 16-bit mode? print(f"Adding the LoRA {lora_name} to the model...") - + params = {} - params['device_map'] = {'': 0} - #params['dtype'] = shared.model.dtype + if shared.args.load_in_8bit: + params['device_map'] = {'': 0} + else: + params['device_map'] = 'auto' + params['dtype'] = shared.model.dtype + shared.model = PeftModel.from_pretrained(shared.model, Path(f"loras/{lora_name}"), **params) + if not shared.args.load_in_8bit: + shared.model.half() From 29bd41d453cc8404b7183af685cdd4b952e96435 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 23 Mar 2023 01:05:13 -0300 Subject: [PATCH 08/42] Fix LoRA in CPU mode --- modules/LoRA.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/LoRA.py b/modules/LoRA.py index 20850338..0a2aaa7d 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -18,10 +18,10 @@ def add_lora_to_model(lora_name): params = {} if shared.args.load_in_8bit: params['device_map'] = {'': 0} - else: + elif not shared.args.cpu: params['device_map'] = 'auto' params['dtype'] = shared.model.dtype shared.model = PeftModel.from_pretrained(shared.model, Path(f"loras/{lora_name}"), **params) - if not shared.args.load_in_8bit: + if not shared.args.load_in_8bit and not shared.args.cpu: shared.model.half() From c5ebcc5f7e862b1f2c6b1d807bbf2c1aadeb159e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 23 Mar 2023 13:36:00 -0300 Subject: [PATCH 09/42] Change the default names (#518) * Update shared.py * Update settings-template.json --- modules/shared.py | 6 +++--- settings-template.json | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index 8d591f4f..720c697e 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -27,9 +27,9 @@ settings = { 'max_new_tokens': 200, 'max_new_tokens_min': 1, 'max_new_tokens_max': 2000, - 'name1': 'Person 1', - 'name2': 'Person 2', - 'context': 'This is a conversation between two people.', + 'name1': 'You', + 'name2': 'Assistant', + 'context': 'This is a conversation with your Assistant. The Assistant is very helpful and is eager to chat with you and answer your questions.', 'stop_at_newline': False, 'chat_prompt_size': 2048, 'chat_prompt_size_min': 0, diff --git a/settings-template.json b/settings-template.json index 7a7de7af..79fd5023 100644 --- a/settings-template.json +++ b/settings-template.json @@ -2,9 +2,9 @@ "max_new_tokens": 200, "max_new_tokens_min": 1, "max_new_tokens_max": 2000, - "name1": "Person 1", - "name2": "Person 2", - "context": "This is a conversation between two people.", + "name1": "You", + "name2": "Assistant", + "context": "This is a conversation with your Assistant. The Assistant is very helpful and is eager to chat with you and answer your questions.", "stop_at_newline": false, "chat_prompt_size": 2048, "chat_prompt_size_min": 0, From 9bf6ecf9e2de9b72c3fa62e0e6f5b5e9041825b1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 23 Mar 2023 16:49:41 -0300 Subject: [PATCH 10/42] Fix LoRA device map (attempt) --- modules/LoRA.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modules/LoRA.py b/modules/LoRA.py index 0a2aaa7d..5f77e340 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -16,12 +16,15 @@ def add_lora_to_model(lora_name): print(f"Adding the LoRA {lora_name} to the model...") params = {} - if shared.args.load_in_8bit: - params['device_map'] = {'': 0} - elif not shared.args.cpu: - params['device_map'] = 'auto' + if not shared.args.cpu: params['dtype'] = shared.model.dtype + if hasattr(shared.model, "hf_device_map"): + params['device_map'] = {"base_model.model."+k: v for k, v in shared.model.hf_device_map.items()} + elif shared.args.load_in_8bit: + params['device_map'] = {'': 0} shared.model = PeftModel.from_pretrained(shared.model, Path(f"loras/{lora_name}"), **params) if not shared.args.load_in_8bit and not shared.args.cpu: shared.model.half() + if not hasattr(shared.model, "hf_device_map"): + shared.model.cuda() From 4578e88ffd77dc249fa97d0ec8cb667b21089ba8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 23 Mar 2023 21:38:20 -0300 Subject: [PATCH 11/42] Stop the bot from talking for you in chat mode --- modules/RWKV.py | 4 ++-- modules/callbacks.py | 20 ++++++++--------- modules/chat.py | 44 ++++++++++++++----------------------- modules/text_generation.py | 45 +++++++++++++++++++------------------- 4 files changed, 51 insertions(+), 62 deletions(-) diff --git a/modules/RWKV.py b/modules/RWKV.py index 5cf8937a..8c7ea2b9 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ -45,11 +45,11 @@ class RWKVModel: token_stop = token_stop ) - return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback) + return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback) def generate_with_streaming(self, **kwargs): with Iteratorize(self.generate, kwargs, callback=None) as generator: - reply = kwargs['context'] + reply = '' for token in generator: reply += token yield reply diff --git a/modules/callbacks.py b/modules/callbacks.py index 12a90cc3..2ae9d908 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -11,24 +11,22 @@ import modules.shared as shared # Copied from https://github.com/PygmalionAI/gradio-ui/ class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria): - def __init__(self, sentinel_token_ids: torch.LongTensor, - starting_idx: int): + def __init__(self, sentinel_token_ids: list[torch.LongTensor], starting_idx: int): transformers.StoppingCriteria.__init__(self) self.sentinel_token_ids = sentinel_token_ids self.starting_idx = starting_idx - def __call__(self, input_ids: torch.LongTensor, - _scores: torch.FloatTensor) -> bool: + def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool: for sample in input_ids: trimmed_sample = sample[self.starting_idx:] - # Can't unfold, output is still too tiny. Skip. - if trimmed_sample.shape[-1] < self.sentinel_token_ids.shape[-1]: - continue - for window in trimmed_sample.unfold( - 0, self.sentinel_token_ids.shape[-1], 1): - if torch.all(torch.eq(self.sentinel_token_ids, window)): - return True + for i in range(len(self.sentinel_token_ids)): + # Can't unfold, output is still too tiny. Skip. + if trimmed_sample.shape[-1] < self.sentinel_token_ids[i].shape[-1]: + continue + for window in trimmed_sample.unfold(0, self.sentinel_token_ids[i].shape[-1], 1): + if torch.all(torch.eq(self.sentinel_token_ids[i], window)): + return True return False class Stream(transformers.StoppingCriteria): diff --git a/modules/chat.py b/modules/chat.py index 78fc4ab5..b1280d48 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -51,41 +51,31 @@ def generate_chat_prompt(user_input, max_new_tokens, name1, name2, context, chat prompt = ''.join(rows) return prompt -def extract_message_from_reply(question, reply, name1, name2, check, impersonate=False): +def extract_message_from_reply(reply, name1, name2, check): next_character_found = False - asker = name1 if not impersonate else name2 - replier = name2 if not impersonate else name1 - - previous_idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", question)] - idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", reply)] - idx = idx[max(len(previous_idx)-1, 0)] - - if not impersonate: - reply = reply[idx + 1 + len(apply_extensions(f"{replier}:", "bot_prefix")):] - else: - reply = reply[idx + 1 + len(f"{replier}:"):] - if check: lines = reply.split('\n') reply = lines[0].strip() if len(lines) > 1: next_character_found = True else: - idx = reply.find(f"\n{asker}:") - if idx != -1: - reply = reply[:idx] - next_character_found = True - reply = fix_newlines(reply) + for string in [f"\n{name1}:", f"\n{name2}:"]: + idx = reply.find(string) + if idx != -1: + reply = reply[:idx] + next_character_found = True # If something like "\nYo" is generated just before "\nYou:" # is completed, trim it - next_turn = f"\n{asker}:" - for j in range(len(next_turn)-1, 0, -1): - if reply[-j:] == next_turn[:j]: - reply = reply[:-j] - break + if not next_character_found: + for string in [f"\n{name1}:", f"\n{name2}:"]: + for j in range(len(string)-1, 0, -1): + if reply[-j:] == string[:j]: + reply = reply[:-j] + break + reply = fix_newlines(reply) return reply, next_character_found def stop_everything_event(): @@ -127,10 +117,10 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical # Generate reply = '' for i in range(chat_generation_attempts): - for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_string=f"\n{name1}:"): + for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]): # Extracting the reply - reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check) + reply, next_character_found = extract_message_from_reply(reply, name1, name2, check) visible_reply = re.sub("(||{{user}})", name1_original, reply) visible_reply = apply_extensions(visible_reply, "output") if shared.args.chat: @@ -166,8 +156,8 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ # Yield *Is typing...* yield shared.processing_message for i in range(chat_generation_attempts): - for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_string=f"\n{name2}:"): - reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check, impersonate=True) + for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]): + reply, next_character_found = extract_message_from_reply(reply, name1, name2, check) yield reply if next_character_found: break diff --git a/modules/text_generation.py b/modules/text_generation.py index e738cb21..fd017e2c 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -99,25 +99,37 @@ def set_manual_seed(seed): if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed) -def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=None, stopping_string=None): +def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=None, stopping_strings=[]): clear_torch_cache() set_manual_seed(seed) t0 = time.time() + original_question = question + if not (shared.args.chat or shared.args.cai_chat): + question = apply_extensions(question, "input") + if shared.args.verbose: + print(f"\n\n{question}\n--------------------\n") + # These models are not part of Hugging Face, so we handle them # separately and terminate the function call earlier if shared.is_RWKV: try: if shared.args.no_stream: reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k) + if not (shared.args.chat or shared.args.cai_chat): + reply = original_question + apply_extensions(reply, "output") yield formatted_outputs(reply, shared.model_name) else: if not (shared.args.chat or shared.args.cai_chat): yield formatted_outputs(question, shared.model_name) + # RWKV has proper streaming, which is very nice. # No need to generate 8 tokens at a time. for reply in shared.model.generate_with_streaming(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k): + if not (shared.args.chat or shared.args.cai_chat): + reply = original_question + apply_extensions(reply, "output") yield formatted_outputs(reply, shared.model_name) + except Exception: traceback.print_exc() finally: @@ -127,12 +139,6 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)") return - original_question = question - if not (shared.args.chat or shared.args.cai_chat): - question = apply_extensions(question, "input") - if shared.args.verbose: - print(f"\n\n{question}\n--------------------\n") - input_ids = encode(question, max_new_tokens) original_input_ids = input_ids output = input_ids[0] @@ -142,9 +148,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi if eos_token is not None: eos_token_ids.append(int(encode(eos_token)[0][-1])) stopping_criteria_list = transformers.StoppingCriteriaList() - if stopping_string is not None: - # Copied from https://github.com/PygmalionAI/gradio-ui/blob/master/src/model.py - t = encode(stopping_string, 0, add_special_tokens=False) + if type(stopping_strings) is list and len(stopping_strings) > 0: + t = [encode(string, 0, add_special_tokens=False) for string in stopping_strings] stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=t, starting_idx=len(input_ids[0]))) generate_params = {} @@ -195,12 +200,10 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi if shared.soft_prompt: output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) + new_tokens = len(output) - len(input_ids[0]) + reply = decode(output[-new_tokens:]) if not (shared.args.chat or shared.args.cai_chat): - new_tokens = len(output) - len(input_ids[0]) - reply = decode(output[-new_tokens:]) reply = original_question + apply_extensions(reply, "output") - else: - reply = decode(output) yield formatted_outputs(reply, shared.model_name) @@ -223,12 +226,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi for output in generator: if shared.soft_prompt: output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) + + new_tokens = len(output) - len(input_ids[0]) + reply = decode(output[-new_tokens:]) if not (shared.args.chat or shared.args.cai_chat): - new_tokens = len(output) - len(input_ids[0]) - reply = decode(output[-new_tokens:]) reply = original_question + apply_extensions(reply, "output") - else: - reply = decode(output) if output[-1] in eos_token_ids: break @@ -244,12 +246,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi output = shared.model.generate(**generate_params)[0] if shared.soft_prompt: output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) + + new_tokens = len(output) - len(original_input_ids[0]) + reply = decode(output[-new_tokens:]) if not (shared.args.chat or shared.args.cai_chat): - new_tokens = len(output) - len(original_input_ids[0]) - reply = decode(output[-new_tokens:]) reply = original_question + apply_extensions(reply, "output") - else: - reply = decode(output) if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)): break From bf22d16ebcee96430d6845c9786bbdab5e74af17 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 23 Mar 2023 21:56:26 -0300 Subject: [PATCH 12/42] Clear cache while switching LoRAs --- modules/LoRA.py | 15 +++++++++------ modules/callbacks.py | 8 +------- server.py | 14 +++----------- 3 files changed, 13 insertions(+), 24 deletions(-) diff --git a/modules/LoRA.py b/modules/LoRA.py index 5f77e340..1c03826b 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -2,19 +2,22 @@ from pathlib import Path import modules.shared as shared from modules.models import load_model +from modules.text_generation import clear_torch_cache +def reload_model(): + shared.model = shared.tokenizer = None + clear_torch_cache() + shared.model, shared.tokenizer = load_model(shared.model_name) + def add_lora_to_model(lora_name): from peft import PeftModel - # Is there a more efficient way of returning to the base model? - if lora_name == "None": - print("Reloading the model to remove the LoRA...") - shared.model, shared.tokenizer = load_model(shared.model_name) - else: + reload_model() + + if lora_name != "None": print(f"Adding the LoRA {lora_name} to the model...") - params = {} if not shared.args.cpu: params['dtype'] = shared.model.dtype diff --git a/modules/callbacks.py b/modules/callbacks.py index 2ae9d908..50a69183 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -1,11 +1,10 @@ -import gc from queue import Queue from threading import Thread import torch import transformers -import modules.shared as shared +from modules.text_generation import clear_torch_cache # Copied from https://github.com/PygmalionAI/gradio-ui/ @@ -90,8 +89,3 @@ class Iteratorize: def __exit__(self, exc_type, exc_val, exc_tb): self.stop_now = True clear_torch_cache() - -def clear_torch_cache(): - gc.collect() - if not shared.args.cpu: - torch.cuda.empty_cache() diff --git a/server.py b/server.py index cdf7aa93..068f380a 100644 --- a/server.py +++ b/server.py @@ -1,4 +1,3 @@ -import gc import io import json import re @@ -8,7 +7,6 @@ import zipfile from pathlib import Path import gradio as gr -import torch import modules.chat as chat import modules.extensions as extensions_module @@ -17,7 +15,7 @@ import modules.ui as ui from modules.html_generator import generate_chat_html from modules.LoRA import add_lora_to_model from modules.models import load_model, load_soft_prompt -from modules.text_generation import generate_reply +from modules.text_generation import clear_torch_cache, generate_reply # Loading custom settings settings_file = None @@ -56,21 +54,15 @@ def load_model_wrapper(selected_model): if selected_model != shared.model_name: shared.model_name = selected_model shared.model = shared.tokenizer = None - if not shared.args.cpu: - gc.collect() - torch.cuda.empty_cache() + clear_torch_cache() shared.model, shared.tokenizer = load_model(shared.model_name) return selected_model def load_lora_wrapper(selected_lora): shared.lora_name = selected_lora - default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')] - - if not shared.args.cpu: - gc.collect() - torch.cuda.empty_cache() add_lora_to_model(selected_lora) + default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')] return selected_lora, default_text From b0abb327d822f8fe4c0180a4a725c0e362182b8f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 23 Mar 2023 22:02:09 -0300 Subject: [PATCH 13/42] Update LoRA.py --- modules/LoRA.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modules/LoRA.py b/modules/LoRA.py index 1c03826b..aa68ad32 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -14,7 +14,11 @@ def add_lora_to_model(lora_name): from peft import PeftModel - reload_model() + # If a LoRA had been previously loaded, or if we want + # to unload a LoRA, reload the model + if shared.lora_name != "None" or lora_name == "None": + reload_model() + shared.lora_name = lora_name if lora_name != "None": print(f"Adding the LoRA {lora_name} to the model...") From 9bdb3c784d07b4f81f8dc39a97796d231bd89bff Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 23 Mar 2023 22:02:40 -0300 Subject: [PATCH 14/42] Minor fix --- server.py | 1 - 1 file changed, 1 deletion(-) diff --git a/server.py b/server.py index 068f380a..435b8525 100644 --- a/server.py +++ b/server.py @@ -60,7 +60,6 @@ def load_model_wrapper(selected_model): return selected_model def load_lora_wrapper(selected_lora): - shared.lora_name = selected_lora add_lora_to_model(selected_lora) default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')] From d1327f99f915aca83abac739107cdb8c5d29d278 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 23 Mar 2023 22:12:24 -0300 Subject: [PATCH 15/42] Fix broken callbacks.py --- modules/callbacks.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modules/callbacks.py b/modules/callbacks.py index 50a69183..93cd1d63 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -4,8 +4,6 @@ from threading import Thread import torch import transformers -from modules.text_generation import clear_torch_cache - # Copied from https://github.com/PygmalionAI/gradio-ui/ class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria): @@ -89,3 +87,8 @@ class Iteratorize: def __exit__(self, exc_type, exc_val, exc_tb): self.stop_now = True clear_torch_cache() + +def clear_torch_cache(): + gc.collect() + if not shared.args.cpu: + torch.cuda.empty_cache() From 7078d168c31084255a99e1b4fd879e9a8a353a0d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 23 Mar 2023 22:16:08 -0300 Subject: [PATCH 16/42] Missing import --- modules/callbacks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/callbacks.py b/modules/callbacks.py index 93cd1d63..40811408 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -1,3 +1,4 @@ +import gc from queue import Queue from threading import Thread From 8747c74339cf1e7f1d45f4aa1dcc090e9eba94a3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 23 Mar 2023 22:19:01 -0300 Subject: [PATCH 17/42] Another missing import --- modules/callbacks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/callbacks.py b/modules/callbacks.py index 40811408..2ae9d908 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -5,6 +5,8 @@ from threading import Thread import torch import transformers +import modules.shared as shared + # Copied from https://github.com/PygmalionAI/gradio-ui/ class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria): From dcfd866402dfbbc849bd4441fd1de9448de18c75 Mon Sep 17 00:00:00 2001 From: EyeDeck Date: Thu, 23 Mar 2023 21:31:34 -0400 Subject: [PATCH 18/42] Allow loading of .safetensors through GPTQ-for-LLaMa --- modules/GPTQ_loader.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index 32a5458f..bec6c66f 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -37,21 +37,23 @@ def load_quantized(model_name): path_to_model = Path(f'models/{model_name}') if path_to_model.name.lower().startswith('llama-7b'): - pt_model = f'llama-7b-{shared.args.gptq_bits}bit.pt' + pt_model = f'llama-7b-{shared.args.gptq_bits}bit' elif path_to_model.name.lower().startswith('llama-13b'): - pt_model = f'llama-13b-{shared.args.gptq_bits}bit.pt' + pt_model = f'llama-13b-{shared.args.gptq_bits}bit' elif path_to_model.name.lower().startswith('llama-30b'): - pt_model = f'llama-30b-{shared.args.gptq_bits}bit.pt' + pt_model = f'llama-30b-{shared.args.gptq_bits}bit' elif path_to_model.name.lower().startswith('llama-65b'): - pt_model = f'llama-65b-{shared.args.gptq_bits}bit.pt' + pt_model = f'llama-65b-{shared.args.gptq_bits}bit' else: - pt_model = f'{model_name}-{shared.args.gptq_bits}bit.pt' + pt_model = f'{model_name}-{shared.args.gptq_bits}bit' - # Try to find the .pt both in models/ and in the subfolder + # Try to find the .safetensors or .pt both in models/ and in the subfolder pt_path = None - for path in [Path(p) for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]: + for path in [Path(p+ext) for ext in ['.safetensors', '.pt'] for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]: if path.exists(): + print(f"Found {path}") pt_path = path + break if not pt_path: print(f"Could not find {pt_model}, exiting...") From 143b5b5edf5d47539496598dbdb6cfe4843c169a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 23 Mar 2023 23:28:50 -0300 Subject: [PATCH 19/42] Mention one-click-bandaid in the README --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index cb070445..85dcc270 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,10 @@ Just download the zip above, extract it, and double click on "install". The web Source codes: https://github.com/oobabooga/one-click-installers +> **Note** +> +> To get 8-bit and 4-bit models working in your 1-click Windows installation, you can use the [one-click-bandaid](https://github.com/ClayShoaf/oobabooga-one-click-bandaid). + This method lags behind the newest developments and does not support 8-bit mode on Windows without additional set up: https://github.com/oobabooga/text-generation-webui/issues/147#issuecomment-1456040134, https://github.com/oobabooga/text-generation-webui/issues/20#issuecomment-1411650652 ### Alternative: Docker From bb4cb2245373acb950e1c8dbaa73caf75920723d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 24 Mar 2023 00:49:04 -0300 Subject: [PATCH 20/42] Download .pt files using download-model.py (for 4-bit models) --- download-model.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/download-model.py b/download-model.py index 7c2965f6..7ca33b7d 100644 --- a/download-model.py +++ b/download-model.py @@ -116,10 +116,11 @@ def get_download_links_from_huggingface(model, branch): is_pytorch = re.match("(pytorch|adapter)_model.*\.bin", fname) is_safetensors = re.match("model.*\.safetensors", fname) + is_pt = re.match(".*\.pt", fname) is_tokenizer = re.match("tokenizer.*\.model", fname) is_text = re.match(".*\.(txt|json|py)", fname) or is_tokenizer - if any((is_pytorch, is_safetensors, is_text, is_tokenizer)): + if any((is_pytorch, is_safetensors, is_pt, is_tokenizer, is_text)): if is_text: links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}") classifications.append('text') @@ -132,7 +133,8 @@ def get_download_links_from_huggingface(model, branch): elif is_pytorch: has_pytorch = True classifications.append('pytorch') - + elif is_pt: + classifications.append('pt') cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50' cursor = base64.b64encode(cursor) From 04417b658b53207c805851145c96bc1ce903937b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 24 Mar 2023 01:40:43 -0300 Subject: [PATCH 21/42] Update README.md --- README.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 85dcc270..4e4959ac 100644 --- a/README.md +++ b/README.md @@ -84,10 +84,6 @@ pip install -r requirements.txt > > For bitsandbytes and `--load-in-8bit` to work on Linux/WSL, this dirty fix is currently necessary: https://github.com/oobabooga/text-generation-webui/issues/400#issuecomment-1474876859 -### Alternative: native Windows installation - -As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings). - ### Alternative: one-click installers [oobabooga-windows.zip](https://github.com/oobabooga/one-click-installers/archive/refs/heads/oobabooga-windows.zip) @@ -105,7 +101,9 @@ Source codes: https://github.com/oobabooga/one-click-installers > > To get 8-bit and 4-bit models working in your 1-click Windows installation, you can use the [one-click-bandaid](https://github.com/ClayShoaf/oobabooga-one-click-bandaid). -This method lags behind the newest developments and does not support 8-bit mode on Windows without additional set up: https://github.com/oobabooga/text-generation-webui/issues/147#issuecomment-1456040134, https://github.com/oobabooga/text-generation-webui/issues/20#issuecomment-1411650652 +### Alternative: native Windows installation + +As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Installation instructions for human beings](https://github.com/oobabooga/text-generation-webui/wiki/Installation-instructions-for-human-beings). ### Alternative: Docker From 4f5c2ce78560689dc8ed08a3cbb33ef15a3b4a95 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 24 Mar 2023 02:03:30 -0300 Subject: [PATCH 22/42] Fix chat_generation_attempts --- modules/chat.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index b1280d48..061177d2 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -115,9 +115,10 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical yield shared.history['visible']+[[visible_text, shared.processing_message]] # Generate - reply = '' + cumulative_reply = '' for i in range(chat_generation_attempts): - for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]): + for reply in generate_reply(f"{prompt}{' ' if len(cumulative_reply) > 0 else ''}{cumulative_reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]): + reply = cumulative_reply + reply # Extracting the reply reply, next_character_found = extract_message_from_reply(reply, name1, name2, check) @@ -142,6 +143,8 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical if next_character_found: break + cumulative_reply = reply + yield shared.history['visible'] def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1): @@ -152,16 +155,21 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size, impersonate=True) - reply = '' # Yield *Is typing...* yield shared.processing_message + + cumulative_reply = '' for i in range(chat_generation_attempts): - for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]): + for reply in generate_reply(f"{prompt}{' ' if len(cumulative_reply) > 0 else ''}{cumulative_reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]): + reply = cumulative_reply + reply reply, next_character_found = extract_message_from_reply(reply, name1, name2, check) yield reply if next_character_found: break - yield reply + + cumulative_reply = reply + + yield reply def cai_chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1): for _history in chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, name1, name2, context, check, chat_prompt_size, chat_generation_attempts): From b740c5b2847ee778c20f8232d94f25ab84fce108 Mon Sep 17 00:00:00 2001 From: Forkoz <59298527+Ph0rk0z@users.noreply.github.com> Date: Fri, 24 Mar 2023 08:56:07 -0500 Subject: [PATCH 23/42] Add display of context when input was generated Not sure if I did this right but it does move with the conversation and seems to match value. --- modules/text_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index fd017e2c..9b2c233d 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -270,5 +270,5 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi traceback.print_exc() finally: t1 = time.time() - print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(original_input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(original_input_ids[0])} tokens)") + print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(original_input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(original_input_ids[0])} tokens, context {len(original_input_ids[0])})") return From fd99995b01878246b62302d31a844dd68ee7d139 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 24 Mar 2023 15:59:27 -0300 Subject: [PATCH 24/42] Make the Stop button more consistent in chat mode --- server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server.py b/server.py index 435b8525..7b25e91d 100644 --- a/server.py +++ b/server.py @@ -329,7 +329,7 @@ def create_interface(): gen_events.append(shared.gradio['textbox'].submit(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream)) gen_events.append(shared.gradio['Regenerate'].click(chat.regenerate_wrapper, shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream)) gen_events.append(shared.gradio['Impersonate'].click(chat.impersonate_wrapper, shared.input_params, shared.gradio['textbox'], show_progress=shared.args.no_stream)) - shared.gradio['Stop'].click(chat.stop_everything_event, [], [], cancels=gen_events) + shared.gradio['Stop'].click(chat.stop_everything_event, [], [], cancels=gen_events, queue=False) shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, [], shared.gradio['textbox'], show_progress=shared.args.no_stream) shared.gradio['Replace last reply'].click(chat.replace_last_reply, [shared.gradio['textbox'], shared.gradio['name1'], shared.gradio['name2']], shared.gradio['display'], show_progress=shared.args.no_stream) From d8e950d6bdf933f8a0cd78a0c7cb2a941b8d32e3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 24 Mar 2023 16:30:32 -0300 Subject: [PATCH 25/42] Don't load the model twice when using --lora --- server.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/server.py b/server.py index 7b25e91d..f423e368 100644 --- a/server.py +++ b/server.py @@ -233,9 +233,7 @@ else: shared.model_name = available_models[i] shared.model, shared.tokenizer = load_model(shared.model_name) if shared.args.lora: - print(shared.args.lora) - shared.lora_name = shared.args.lora - add_lora_to_model(shared.lora_name) + add_lora_to_model(shared.args.lora) # Default UI settings default_preset = shared.settings['presets'][next((k for k in shared.settings['presets'] if re.match(k.lower(), shared.model_name.lower())), 'default')] From 8fad84abc2c8eb90718ef7d7084a22c740e20d9b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 24 Mar 2023 16:51:27 -0300 Subject: [PATCH 26/42] Update extensions.py --- modules/extensions.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/extensions.py b/modules/extensions.py index b363bc39..c55dc978 100644 --- a/modules/extensions.py +++ b/modules/extensions.py @@ -1,3 +1,5 @@ +import traceback + import gradio as gr import extensions @@ -18,6 +20,7 @@ def load_extensions(): print('Ok.') except: print('Fail.') + traceback.print_exc() # This iterator returns the extensions in the order specified in the command-line def iterator(): @@ -38,6 +41,7 @@ def apply_extensions(text, typ): def create_extensions_block(): global setup_called + # Updating the default values for extension, name in iterator(): if hasattr(extension, 'params'): @@ -47,6 +51,7 @@ def create_extensions_block(): extension.params[param] = shared.settings[_id] should_display_ui = False + # Running setup function if not setup_called: for extension, name in iterator(): From 4a724ed22fc8942677f44df39674b571450ea51c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 24 Mar 2023 16:53:56 -0300 Subject: [PATCH 27/42] Reorder imports --- extensions/api/script.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/extensions/api/script.py b/extensions/api/script.py index bbd8551a..7783594c 100644 --- a/extensions/api/script.py +++ b/extensions/api/script.py @@ -1,8 +1,9 @@ +import json from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer from threading import Thread + from modules import shared -from modules.text_generation import generate_reply, encode -import json +from modules.text_generation import encode, generate_reply params = { 'port': 5000, @@ -87,4 +88,4 @@ def run_server(): server.serve_forever() def setup(): - Thread(target=run_server, daemon=True).start() \ No newline at end of file + Thread(target=run_server, daemon=True).start() From ffb0187e83043ddbbc7ab1b29e843a1ee6107b54 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 24 Mar 2023 17:17:29 -0300 Subject: [PATCH 28/42] Update chat.py --- modules/chat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index c1e55ac4..0dc5c922 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -34,12 +34,12 @@ def generate_chat_prompt(user_input, max_new_tokens, name1, name2, context, chat while i >= 0 and len(encode(''.join(rows), max_new_tokens)[0]) < max_length: rows.insert(1, f"{name2}: {shared.history['internal'][i][1].strip()}\n") prev_user_input = shared.history['internal'][i][0] - if prev_user_input and not shared.history['internal'][i][0] == '<|BEGIN-VISIBLE-CHAT|>': - rows.insert(1, f"{name1}: {shared.history['internal'][i][0].strip()}\n") + if len(prev_user_input) > 0 and prev_user_input != '<|BEGIN-VISIBLE-CHAT|>': + rows.insert(1, f"{name1}: {prev_user_input.strip()}\n") i -= 1 if not impersonate: - if user_input: + if len(user_input) > 0: rows.append(f"{name1}: {user_input}\n") rows.append(apply_extensions(f"{name2}:", "bot_prefix")) limit = 3 From 6e1b16c2aa87d167ed9893e800c199408ba946d3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 24 Mar 2023 17:18:27 -0300 Subject: [PATCH 29/42] Update html_generator.py --- modules/html_generator.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/html_generator.py b/modules/html_generator.py index f8cff6d8..ff18c913 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -119,13 +119,13 @@ def load_html_image(paths): def generate_chat_html(history, name1, name2, character): output = f'
' - + img_bot = load_html_image([f"characters/{character}.{ext}" for ext in ['png', 'jpg', 'jpeg']] + ["img_bot.png","img_bot.jpg","img_bot.jpeg"]) img_me = load_html_image(["img_me.png", "img_me.jpg", "img_me.jpeg"]) for i,_row in enumerate(history[::-1]): row = [convert_to_markdown(entry) for entry in _row] - + output += f"""
@@ -142,8 +142,9 @@ def generate_chat_html(history, name1, name2, character):
""" - if not row[0]: # don't display empty user messages + if len(row[0]) == 0: # don't display empty user messages continue + output += f"""
From a80aa65986ec159a5f9a198455b4f8f061b0d52f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 24 Mar 2023 19:53:20 -0300 Subject: [PATCH 30/42] Update models.py --- modules/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/models.py b/modules/models.py index ccb97da3..c9f03588 100644 --- a/modules/models.py +++ b/modules/models.py @@ -44,7 +44,7 @@ def load_model(model_name): shared.is_RWKV = model_name.lower().startswith('rwkv-') # Default settings - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.gptq_bits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]): if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')): model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True) else: @@ -95,7 +95,7 @@ def load_model(model_name): return model, tokenizer # Quantized model - elif shared.args.gptq_bits > 0: + elif shared.args.wbits > 0: from modules.GPTQ_loader import load_quantized model = load_quantized(model_name) From 0a162244513b9d2946ad6ffff8b40c8d77e342ae Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 24 Mar 2023 19:54:36 -0300 Subject: [PATCH 31/42] Update GPTQ_loader.py --- modules/GPTQ_loader.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index 32a5458f..b58c8964 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -14,7 +14,7 @@ import opt def load_quantized(model_name): - if not shared.args.gptq_model_type: + if not shared.args.model_type: # Try to determine model type from model name model_type = model_name.split('-')[0].lower() if model_type not in ('llama', 'opt'): @@ -22,10 +22,10 @@ def load_quantized(model_name): "argument") exit() else: - model_type = shared.args.gptq_model_type.lower() + model_type = shared.args.model_type.lower() if model_type == 'llama': - if not shared.args.gptq_pre_layer: + if not shared.args.pre_layer: load_quant = llama.load_quant else: load_quant = llama_inference_offload.load_quant @@ -37,15 +37,15 @@ def load_quantized(model_name): path_to_model = Path(f'models/{model_name}') if path_to_model.name.lower().startswith('llama-7b'): - pt_model = f'llama-7b-{shared.args.gptq_bits}bit.pt' + pt_model = f'llama-7b-{shared.args.wbits}bit.pt' elif path_to_model.name.lower().startswith('llama-13b'): - pt_model = f'llama-13b-{shared.args.gptq_bits}bit.pt' + pt_model = f'llama-13b-{shared.args.wbits}bit.pt' elif path_to_model.name.lower().startswith('llama-30b'): - pt_model = f'llama-30b-{shared.args.gptq_bits}bit.pt' + pt_model = f'llama-30b-{shared.args.wbits}bit.pt' elif path_to_model.name.lower().startswith('llama-65b'): - pt_model = f'llama-65b-{shared.args.gptq_bits}bit.pt' + pt_model = f'llama-65b-{shared.args.wbits}bit.pt' else: - pt_model = f'{model_name}-{shared.args.gptq_bits}bit.pt' + pt_model = f'{model_name}-{shared.args.wbits}bit.pt' # Try to find the .pt both in models/ and in the subfolder pt_path = None @@ -58,10 +58,10 @@ def load_quantized(model_name): exit() # qwopqwop200's offload - if shared.args.gptq_pre_layer: - model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits, shared.args.gptq_pre_layer) + if shared.args.pre_layer: + model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.pre_layer) else: - model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits) + model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits) # accelerate offload (doesn't work properly) if shared.args.gpu_memory: From a6bf54739c61ac230e94f95ff209004221efeb86 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 24 Mar 2023 19:56:45 -0300 Subject: [PATCH 32/42] Revert models.py (accident) --- modules/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/models.py b/modules/models.py index c9f03588..ccb97da3 100644 --- a/modules/models.py +++ b/modules/models.py @@ -44,7 +44,7 @@ def load_model(model_name): shared.is_RWKV = model_name.lower().startswith('rwkv-') # Default settings - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.gptq_bits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]): if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')): model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True) else: @@ -95,7 +95,7 @@ def load_model(model_name): return model, tokenizer # Quantized model - elif shared.args.wbits > 0: + elif shared.args.gptq_bits > 0: from modules.GPTQ_loader import load_quantized model = load_quantized(model_name) From 9fa47c0eed275f64a16c01bea7df2c9aba16f13b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 24 Mar 2023 19:57:12 -0300 Subject: [PATCH 33/42] Revert GPTQ_loader.py (accident) --- modules/GPTQ_loader.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index b58c8964..32a5458f 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -14,7 +14,7 @@ import opt def load_quantized(model_name): - if not shared.args.model_type: + if not shared.args.gptq_model_type: # Try to determine model type from model name model_type = model_name.split('-')[0].lower() if model_type not in ('llama', 'opt'): @@ -22,10 +22,10 @@ def load_quantized(model_name): "argument") exit() else: - model_type = shared.args.model_type.lower() + model_type = shared.args.gptq_model_type.lower() if model_type == 'llama': - if not shared.args.pre_layer: + if not shared.args.gptq_pre_layer: load_quant = llama.load_quant else: load_quant = llama_inference_offload.load_quant @@ -37,15 +37,15 @@ def load_quantized(model_name): path_to_model = Path(f'models/{model_name}') if path_to_model.name.lower().startswith('llama-7b'): - pt_model = f'llama-7b-{shared.args.wbits}bit.pt' + pt_model = f'llama-7b-{shared.args.gptq_bits}bit.pt' elif path_to_model.name.lower().startswith('llama-13b'): - pt_model = f'llama-13b-{shared.args.wbits}bit.pt' + pt_model = f'llama-13b-{shared.args.gptq_bits}bit.pt' elif path_to_model.name.lower().startswith('llama-30b'): - pt_model = f'llama-30b-{shared.args.wbits}bit.pt' + pt_model = f'llama-30b-{shared.args.gptq_bits}bit.pt' elif path_to_model.name.lower().startswith('llama-65b'): - pt_model = f'llama-65b-{shared.args.wbits}bit.pt' + pt_model = f'llama-65b-{shared.args.gptq_bits}bit.pt' else: - pt_model = f'{model_name}-{shared.args.wbits}bit.pt' + pt_model = f'{model_name}-{shared.args.gptq_bits}bit.pt' # Try to find the .pt both in models/ and in the subfolder pt_path = None @@ -58,10 +58,10 @@ def load_quantized(model_name): exit() # qwopqwop200's offload - if shared.args.pre_layer: - model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.pre_layer) + if shared.args.gptq_pre_layer: + model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits, shared.args.gptq_pre_layer) else: - model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits) + model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits) # accelerate offload (doesn't work properly) if shared.args.gpu_memory: From 25be9698c74d7af950cbcbf8ec4c0cd9bebc6d3c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 25 Mar 2023 01:18:32 -0300 Subject: [PATCH 34/42] Fix LoRA on mps --- modules/LoRA.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/modules/LoRA.py b/modules/LoRA.py index aa68ad32..283fcf4c 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -1,5 +1,7 @@ from pathlib import Path +import torch + import modules.shared as shared from modules.models import load_model from modules.text_generation import clear_torch_cache @@ -34,4 +36,8 @@ def add_lora_to_model(lora_name): if not shared.args.load_in_8bit and not shared.args.cpu: shared.model.half() if not hasattr(shared.model, "hf_device_map"): - shared.model.cuda() + if torch.has_mps: + device = torch.device('mps') + shared.model = shared.model.to(device) + else: + shared.model = shared.model.cuda() From 70f9565f37c47be34d4bdbabe3c874bc4c4c7039 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 25 Mar 2023 02:35:30 -0300 Subject: [PATCH 35/42] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4e4959ac..60444401 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. * [FlexGen offload](https://github.com/oobabooga/text-generation-webui/wiki/FlexGen). * [DeepSpeed ZeRO-3 offload](https://github.com/oobabooga/text-generation-webui/wiki/DeepSpeed). * Get responses via API, [with](https://github.com/oobabooga/text-generation-webui/blob/main/api-example-streaming.py) or [without](https://github.com/oobabooga/text-generation-webui/blob/main/api-example.py) streaming. -* [LLaMA model, including 4-bit mode](https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model). +* [LLaMA model, including 4-bit GPTQ support](https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model). * [RWKV model](https://github.com/oobabooga/text-generation-webui/wiki/RWKV-model). * [Supports LoRAs](https://github.com/oobabooga/text-generation-webui/wiki/Using-LoRAs). * Supports softprompts. From 8c8e8b44508972a37fd15d760f9e4214e5105306 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 25 Mar 2023 12:35:52 -0300 Subject: [PATCH 36/42] Fix the early stopping callback #559 --- modules/callbacks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/callbacks.py b/modules/callbacks.py index 2ae9d908..8d30d615 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -25,7 +25,7 @@ class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria): if trimmed_sample.shape[-1] < self.sentinel_token_ids[i].shape[-1]: continue for window in trimmed_sample.unfold(0, self.sentinel_token_ids[i].shape[-1], 1): - if torch.all(torch.eq(self.sentinel_token_ids[i], window)): + if torch.all(torch.eq(self.sentinel_token_ids[i][0], window)): return True return False From 9ccf505ccd8484a6af6bb954ce1deea6ce035b05 Mon Sep 17 00:00:00 2001 From: "Alex \"mcmonkey\" Goodwin" Date: Sat, 25 Mar 2023 10:04:00 -0700 Subject: [PATCH 37/42] improve/simplify gitignore - add repositories - remove the redundant "/*" on folders - remove the exclusions for files that already exist --- .gitignore | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index 3cfbbb22..00198b8f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,26 +1,20 @@ -cache/* -characters/* -extensions/silero_tts/outputs/* -extensions/elevenlabs_tts/outputs/* -extensions/sd_api_pictures/outputs/* -logs/* -loras/* -models/* -softprompts/* -torch-dumps/* +cache +characters +extensions/silero_tts/outputs +extensions/elevenlabs_tts/outputs +extensions/sd_api_pictures/outputs +logs +loras +models +softprompts +torch-dumps *pycache* */*pycache* */*/pycache* venv/ .venv/ +repositories settings.json img_bot* img_me* - -!characters/Example.json -!characters/Example.png -!loras/place-your-loras-here.txt -!models/place-your-models-here.txt -!softprompts/place-your-softprompts-here.txt -!torch-dumps/place-your-pt-models-here.txt From 8134c4b334ecc3a6a30e774a7265cbafc42ac6cc Mon Sep 17 00:00:00 2001 From: "Alex \"mcmonkey\" Goodwin" Date: Sat, 25 Mar 2023 12:41:18 -0700 Subject: [PATCH 38/42] add training/datsets to gitignore for #570 --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 00198b8f..36852916 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ cache characters +training/datasets extensions/silero_tts/outputs extensions/elevenlabs_tts/outputs extensions/sd_api_pictures/outputs From 49c10c5570b595e9d4fdcb496c456a9982ede070 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 26 Mar 2023 00:11:33 -0300 Subject: [PATCH 39/42] Add support for the latest GPTQ models with group-size (#530) **Warning: old 4-bit weights will not work anymore!** See here how to get up to date weights: https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model#step-2-get-the-pre-converted-weights --- README.md | 8 ++--- modules/GPTQ_loader.py | 66 +++++++++++++++++++++++++----------------- modules/models.py | 4 +-- modules/shared.py | 24 ++++++++++----- server.py | 5 ++-- 5 files changed, 64 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 60444401..3bfbc72f 100644 --- a/README.md +++ b/README.md @@ -176,10 +176,10 @@ Optionally, you can use the following command-line flags: | `--cai-chat` | Launch the web UI in chat mode with a style similar to Character.AI's. If the file `img_bot.png` or `img_bot.jpg` exists in the same folder as server.py, this image will be used as the bot's profile picture. Similarly, `img_me.png` or `img_me.jpg` will be used as your profile picture. | | `--cpu` | Use the CPU to generate text.| | `--load-in-8bit` | Load the model with 8-bit precision.| -| `--load-in-4bit` | DEPRECATED: use `--gptq-bits 4` instead. | -| `--gptq-bits GPTQ_BITS` | GPTQ: Load a pre-quantized model with specified precision. 2, 3, 4 and 8 (bit) are supported. Currently only works with LLaMA and OPT. | -| `--gptq-model-type MODEL_TYPE` | GPTQ: Model type of pre-quantized model. Currently only LLaMa and OPT are supported. | -| `--gptq-pre-layer GPTQ_PRE_LAYER` | GPTQ: The number of layers to preload. | +| `--wbits WBITS` | GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported. | +| `--model_type MODEL_TYPE` | GPTQ: Model type of pre-quantized model. Currently only LLaMA and OPT are supported. | +| `--groupsize GROUPSIZE` | GPTQ: Group size. | +| `--pre_layer PRE_LAYER` | GPTQ: The number of layers to preload. | | `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. | | `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.| | `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. | diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index bec6c66f..afb5695f 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -14,18 +14,21 @@ import opt def load_quantized(model_name): - if not shared.args.gptq_model_type: + if not shared.args.model_type: # Try to determine model type from model name - model_type = model_name.split('-')[0].lower() - if model_type not in ('llama', 'opt'): - print("Can't determine model type from model name. Please specify it manually using --gptq-model-type " + if model_name.lower().startswith(('llama', 'alpaca')): + model_type = 'llama' + elif model_name.lower().startswith(('opt', 'galactica')): + model_type = 'opt' + else: + print("Can't determine model type from model name. Please specify it manually using --model_type " "argument") exit() else: - model_type = shared.args.gptq_model_type.lower() + model_type = shared.args.model_type.lower() if model_type == 'llama': - if not shared.args.gptq_pre_layer: + if not shared.args.pre_layer: load_quant = llama.load_quant else: load_quant = llama_inference_offload.load_quant @@ -35,35 +38,44 @@ def load_quantized(model_name): print("Unknown pre-quantized model type specified. Only 'llama' and 'opt' are supported") exit() + # Now we are going to try to locate the quantized model file. path_to_model = Path(f'models/{model_name}') - if path_to_model.name.lower().startswith('llama-7b'): - pt_model = f'llama-7b-{shared.args.gptq_bits}bit' - elif path_to_model.name.lower().startswith('llama-13b'): - pt_model = f'llama-13b-{shared.args.gptq_bits}bit' - elif path_to_model.name.lower().startswith('llama-30b'): - pt_model = f'llama-30b-{shared.args.gptq_bits}bit' - elif path_to_model.name.lower().startswith('llama-65b'): - pt_model = f'llama-65b-{shared.args.gptq_bits}bit' - else: - pt_model = f'{model_name}-{shared.args.gptq_bits}bit' - - # Try to find the .safetensors or .pt both in models/ and in the subfolder + found_pts = list(path_to_model.glob("*.pt")) + found_safetensors = list(path_to_model.glob("*.safetensors")) pt_path = None - for path in [Path(p+ext) for ext in ['.safetensors', '.pt'] for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]: - if path.exists(): - print(f"Found {path}") - pt_path = path - break + + if len(found_pts) == 1: + pt_path = found_pts[0] + elif len(found_safetensors) == 1: + pt_path = found_safetensors[0] + else: + if path_to_model.name.lower().startswith('llama-7b'): + pt_model = f'llama-7b-{shared.args.wbits}bit' + elif path_to_model.name.lower().startswith('llama-13b'): + pt_model = f'llama-13b-{shared.args.wbits}bit' + elif path_to_model.name.lower().startswith('llama-30b'): + pt_model = f'llama-30b-{shared.args.wbits}bit' + elif path_to_model.name.lower().startswith('llama-65b'): + pt_model = f'llama-65b-{shared.args.wbits}bit' + else: + pt_model = f'{model_name}-{shared.args.wbits}bit' + + # Try to find the .safetensors or .pt both in models/ and in the subfolder + for path in [Path(p+ext) for ext in ['.safetensors', '.pt'] for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]: + if path.exists(): + print(f"Found {path}") + pt_path = path + break if not pt_path: - print(f"Could not find {pt_model}, exiting...") + print("Could not find the quantized model in .pt or .safetensors format, exiting...") exit() # qwopqwop200's offload - if shared.args.gptq_pre_layer: - model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits, shared.args.gptq_pre_layer) + if shared.args.pre_layer: + model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, shared.args.pre_layer) else: - model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits) + model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize) # accelerate offload (doesn't work properly) if shared.args.gpu_memory: diff --git a/modules/models.py b/modules/models.py index ccb97da3..c9f03588 100644 --- a/modules/models.py +++ b/modules/models.py @@ -44,7 +44,7 @@ def load_model(model_name): shared.is_RWKV = model_name.lower().startswith('rwkv-') # Default settings - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.gptq_bits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]): if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')): model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True) else: @@ -95,7 +95,7 @@ def load_model(model_name): return model, tokenizer # Quantized model - elif shared.args.gptq_bits > 0: + elif shared.args.wbits > 0: from modules.GPTQ_loader import load_quantized model = load_quantized(model_name) diff --git a/modules/shared.py b/modules/shared.py index 720c697e..87896faf 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -52,7 +52,8 @@ settings = { 'default': 'Common sense questions and answers\n\nQuestion: \nFactual answer:', '^(gpt4chan|gpt-4chan|4chan)': '-----\n--- 865467536\nInput text\n--- 865467537\n', '(rosey|chip|joi)_.*_instruct.*': 'User: \n', - 'oasst-*': '<|prompter|>Write a story about future of AI development<|endoftext|><|assistant|>' + 'oasst-*': '<|prompter|>Write a story about future of AI development<|endoftext|><|assistant|>', + 'alpaca-*': "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\nWrite a poem about the transformers Python library. \nMention the word \"large language models\" in that poem.\n### Response:\n", }, 'lora_prompts': { 'default': 'Common sense questions and answers\n\nQuestion: \nFactual answer:', @@ -78,10 +79,15 @@ parser.add_argument('--chat', action='store_true', help='Launch the web UI in ch parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI in chat mode with a style similar to Character.AI\'s. If the file img_bot.png or img_bot.jpg exists in the same folder as server.py, this image will be used as the bot\'s profile picture. Similarly, img_me.png or img_me.jpg will be used as your profile picture.') parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.') parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.') -parser.add_argument('--load-in-4bit', action='store_true', help='DEPRECATED: use --gptq-bits 4 instead.') -parser.add_argument('--gptq-bits', type=int, default=0, help='GPTQ: Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA and OPT.') -parser.add_argument('--gptq-model-type', type=str, help='GPTQ: Model type of pre-quantized model. Currently only LLaMa and OPT are supported.') -parser.add_argument('--gptq-pre-layer', type=int, default=0, help='GPTQ: The number of layers to preload.') + +parser.add_argument('--gptq-bits', type=int, default=0, help='DEPRECATED: use --wbits instead.') +parser.add_argument('--gptq-model-type', type=str, help='DEPRECATED: use --model_type instead.') +parser.add_argument('--gptq-pre-layer', type=int, default=0, help='DEPRECATED: use --pre_layer instead.') +parser.add_argument('--wbits', type=int, default=0, help='GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.') +parser.add_argument('--model_type', type=str, help='GPTQ: Model type of pre-quantized model. Currently only LLaMA and OPT are supported.') +parser.add_argument('--groupsize', type=int, default=-1, help='GPTQ: Group size.') +parser.add_argument('--pre_layer', type=int, default=0, help='GPTQ: The number of layers to preload.') + parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.') parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.') parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.') @@ -109,6 +115,8 @@ parser.add_argument('--verbose', action='store_true', help='Print the prompts to args = parser.parse_args() # Provisional, this will be deleted later -if args.load_in_4bit: - print("Warning: --load-in-4bit is deprecated and will be removed. Use --gptq-bits 4 instead.\n") - args.gptq_bits = 4 +deprecated_dict = {'gptq_bits': ['wbits', 0], 'gptq_model_type': ['model_type', None], 'gptq_pre_layer': ['prelayer', 0]} +for k in deprecated_dict: + if eval(f"args.{k}") != deprecated_dict[k][1]: + print(f"Warning: --{k} is deprecated and will be removed. Use --{deprecated_dict[k][0]} instead.") + exec(f"args.{deprecated_dict[k][0]} = args.{k}") diff --git a/server.py b/server.py index f423e368..f1b95a5b 100644 --- a/server.py +++ b/server.py @@ -237,8 +237,9 @@ if shared.args.lora: # Default UI settings default_preset = shared.settings['presets'][next((k for k in shared.settings['presets'] if re.match(k.lower(), shared.model_name.lower())), 'default')] -default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')] -if default_text == '': +if shared.lora_name != "None": + default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')] +else: default_text = shared.settings['prompts'][next((k for k in shared.settings['prompts'] if re.match(k.lower(), shared.model_name.lower())), 'default')] title ='Text generation web UI' description = '\n\n# Text generation lab\nGenerate text using Large Language Models.\n' From 19174842b82505c4627f095ea910ea0a9c998e1c Mon Sep 17 00:00:00 2001 From: Florian Kusche Date: Sun, 26 Mar 2023 19:41:14 +0200 Subject: [PATCH 40/42] Also download Markdown files --- download-model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download-model.py b/download-model.py index 7ca33b7d..25386e5f 100644 --- a/download-model.py +++ b/download-model.py @@ -118,7 +118,7 @@ def get_download_links_from_huggingface(model, branch): is_safetensors = re.match("model.*\.safetensors", fname) is_pt = re.match(".*\.pt", fname) is_tokenizer = re.match("tokenizer.*\.model", fname) - is_text = re.match(".*\.(txt|json|py)", fname) or is_tokenizer + is_text = re.match(".*\.(txt|json|py|md)", fname) or is_tokenizer if any((is_pytorch, is_safetensors, is_pt, is_tokenizer, is_text)): if is_text: From 9ff6a538b6055b6845efd2f0e625386a847945eb Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 26 Mar 2023 22:11:19 -0300 Subject: [PATCH 41/42] Bump gradio version Make sure to upgrade with `pip install -r requirements.txt --upgrade` --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e5b3de69..c84f2948 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ accelerate==0.17.1 bitsandbytes==0.37.1 flexgen==0.1.7 -gradio==3.18.0 +gradio==3.23.0 markdown numpy peft==0.2.0 From 1c77fdca4cdfca5c636595a8aaaff3281b859d3a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 26 Mar 2023 22:20:30 -0300 Subject: [PATCH 42/42] Change notebook mode appearance --- css/chat.css | 6 ++++++ css/main.css | 10 ++++++++++ css/main.js | 2 +- modules/extensions.py | 4 ++-- server.py | 25 ++++++++++++++----------- 5 files changed, 33 insertions(+), 14 deletions(-) diff --git a/css/chat.css b/css/chat.css index 8d9d88a6..1e703530 100644 --- a/css/chat.css +++ b/css/chat.css @@ -23,3 +23,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .pending.svelte-1ed2p3z { opacity: 1; } + +#extensions { + padding: 0; + padding: 0; +} + diff --git a/css/main.css b/css/main.css index 09f3b6a8..97879f01 100644 --- a/css/main.css +++ b/css/main.css @@ -54,3 +54,13 @@ ol li p, ul li p { .gradio-container-3-18-0 .prose * h1, h2, h3, h4 { color: white; } + +.gradio-container { + max-width: 100% !important; + padding-top: 0 !important; +} + +#extensions { + padding: 15px; + padding: 15px; +} diff --git a/css/main.js b/css/main.js index 9db3fe8b..029ecb62 100644 --- a/css/main.js +++ b/css/main.js @@ -11,7 +11,7 @@ let extensions = document.getElementById('extensions'); main_parent.addEventListener('click', function(e) { // Check if the main element is visible if (main.offsetHeight > 0 && main.offsetWidth > 0) { - extensions.style.display = 'block'; + extensions.style.display = 'flex'; } else { extensions.style.display = 'none'; } diff --git a/modules/extensions.py b/modules/extensions.py index c55dc978..c3cf4de4 100644 --- a/modules/extensions.py +++ b/modules/extensions.py @@ -63,8 +63,8 @@ def create_extensions_block(): # Creating the extension ui elements if should_display_ui: - with gr.Box(elem_id="extensions"): - gr.Markdown("Extensions") + with gr.Column(elem_id="extensions"): for extension, name in iterator(): + gr.Markdown(f"\n### {name}") if hasattr(extension, "ui"): extension.ui() diff --git a/server.py b/server.py index f1b95a5b..56bb499d 100644 --- a/server.py +++ b/server.py @@ -369,19 +369,22 @@ def create_interface(): elif shared.args.notebook: with gr.Tab("Text generation", elem_id="main"): - with gr.Tab('Raw'): - shared.gradio['textbox'] = gr.Textbox(value=default_text, lines=25) - with gr.Tab('Markdown'): - shared.gradio['markdown'] = gr.Markdown() - with gr.Tab('HTML'): - shared.gradio['html'] = gr.HTML() - with gr.Row(): - shared.gradio['Stop'] = gr.Button('Stop') - shared.gradio['Generate'] = gr.Button('Generate') - shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) + with gr.Column(scale=4): + with gr.Tab('Raw'): + shared.gradio['textbox'] = gr.Textbox(value=default_text, elem_id="textbox", lines=25) + with gr.Tab('Markdown'): + shared.gradio['markdown'] = gr.Markdown() + with gr.Tab('HTML'): + shared.gradio['html'] = gr.HTML() - create_model_and_preset_menus() + with gr.Row(): + shared.gradio['Stop'] = gr.Button('Stop') + shared.gradio['Generate'] = gr.Button('Generate') + with gr.Column(scale=1): + shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) + + create_model_and_preset_menus() with gr.Tab("Parameters", elem_id="parameters"): create_settings_menus(default_preset)