diff --git a/modules/RWKV.py b/modules/RWKV.py
index 5cf8937a..8c7ea2b9 100644
--- a/modules/RWKV.py
+++ b/modules/RWKV.py
@@ -45,11 +45,11 @@ class RWKVModel:
             token_stop = token_stop
         )
 
-        return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
+        return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
 
     def generate_with_streaming(self, **kwargs):
         with Iteratorize(self.generate, kwargs, callback=None) as generator:
-            reply = kwargs['context']
+            reply = ''
             for token in generator:
                 reply += token
                 yield reply
diff --git a/modules/callbacks.py b/modules/callbacks.py
index 12a90cc3..2ae9d908 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -11,24 +11,22 @@ import modules.shared as shared
 
 # Copied from https://github.com/PygmalionAI/gradio-ui/
 class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
 
-    def __init__(self, sentinel_token_ids: torch.LongTensor,
-                 starting_idx: int):
+    def __init__(self, sentinel_token_ids: list[torch.LongTensor], starting_idx: int):
         transformers.StoppingCriteria.__init__(self)
         self.sentinel_token_ids = sentinel_token_ids
         self.starting_idx = starting_idx
 
-    def __call__(self, input_ids: torch.LongTensor,
-                 _scores: torch.FloatTensor) -> bool:
+    def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
         for sample in input_ids:
             trimmed_sample = sample[self.starting_idx:]
-            # Can't unfold, output is still too tiny. Skip.
-            if trimmed_sample.shape[-1] < self.sentinel_token_ids.shape[-1]:
-                continue
-            for window in trimmed_sample.unfold(
-                    0, self.sentinel_token_ids.shape[-1], 1):
-                if torch.all(torch.eq(self.sentinel_token_ids, window)):
-                    return True
+            for i in range(len(self.sentinel_token_ids)):
+                # Can't unfold, output is still too tiny. Skip.
+                if trimmed_sample.shape[-1] < self.sentinel_token_ids[i].shape[-1]:
+                    continue
+                for window in trimmed_sample.unfold(0, self.sentinel_token_ids[i].shape[-1], 1):
+                    if torch.all(torch.eq(self.sentinel_token_ids[i], window)):
+                        return True
         return False
 
 class Stream(transformers.StoppingCriteria):
diff --git a/modules/chat.py b/modules/chat.py
index 78fc4ab5..b1280d48 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -51,41 +51,31 @@ def generate_chat_prompt(user_input, max_new_tokens, name1, name2, context, chat
     prompt = ''.join(rows)
     return prompt
 
-def extract_message_from_reply(question, reply, name1, name2, check, impersonate=False):
+def extract_message_from_reply(reply, name1, name2, check):
     next_character_found = False
 
-    asker = name1 if not impersonate else name2
-    replier = name2 if not impersonate else name1
-
-    previous_idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", question)]
-    idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", reply)]
-    idx = idx[max(len(previous_idx)-1, 0)]
-
-    if not impersonate:
-        reply = reply[idx + 1 + len(apply_extensions(f"{replier}:", "bot_prefix")):]
-    else:
-        reply = reply[idx + 1 + len(f"{replier}:"):]
-
     if check:
         lines = reply.split('\n')
         reply = lines[0].strip()
         if len(lines) > 1:
            next_character_found = True
     else:
-        idx = reply.find(f"\n{asker}:")
-        if idx != -1:
-            reply = reply[:idx]
-            next_character_found = True
-        reply = fix_newlines(reply)
+        for string in [f"\n{name1}:", f"\n{name2}:"]:
+            idx = reply.find(string)
+            if idx != -1:
+                reply = reply[:idx]
+                next_character_found = True
 
         # If something like "\nYo" is generated just before "\nYou:"
        # is completed, trim it
-        next_turn = f"\n{asker}:"
-        for j in range(len(next_turn)-1, 0, -1):
-            if reply[-j:] == next_turn[:j]:
-                reply = reply[:-j]
-                break
+        if not next_character_found:
+            for string in [f"\n{name1}:", f"\n{name2}:"]:
+                for j in range(len(string)-1, 0, -1):
+                    if reply[-j:] == string[:j]:
+                        reply = reply[:-j]
+                        break
 
+    reply = fix_newlines(reply)
     return reply, next_character_found
 
 def stop_everything_event():
@@ -127,10 +117,10 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
     # Generate
     reply = ''
     for i in range(chat_generation_attempts):
-        for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_string=f"\n{name1}:"):
+        for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
 
             # Extracting the reply
-            reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check)
+            reply, next_character_found = extract_message_from_reply(reply, name1, name2, check)
             visible_reply = re.sub("(<USER>|<user>|{{user}})", name1_original, reply)
             visible_reply = apply_extensions(visible_reply, "output")
             if shared.args.chat:
@@ -166,8 +156,8 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ
     # Yield *Is typing...*
     yield shared.processing_message
     for i in range(chat_generation_attempts):
-        for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_string=f"\n{name2}:"):
-            reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check, impersonate=True)
+        for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
+            reply, next_character_found = extract_message_from_reply(reply, name1, name2, check)
             yield reply
             if next_character_found:
                 break
diff --git a/modules/text_generation.py b/modules/text_generation.py
index e738cb21..fd017e2c 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -99,25 +99,37 @@ def set_manual_seed(seed):
         if torch.cuda.is_available():
             torch.cuda.manual_seed_all(seed)
 
-def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=None, stopping_string=None):
+def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=None, stopping_strings=[]):
     clear_torch_cache()
     set_manual_seed(seed)
     t0 = time.time()
 
+    original_question = question
+    if not (shared.args.chat or shared.args.cai_chat):
+        question = apply_extensions(question, "input")
+    if shared.args.verbose:
+        print(f"\n\n{question}\n--------------------\n")
+
     # These models are not part of Hugging Face, so we handle them
     # separately and terminate the function call earlier
     if shared.is_RWKV:
         try:
             if shared.args.no_stream:
                 reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k)
+                if not (shared.args.chat or shared.args.cai_chat):
+                    reply = original_question + apply_extensions(reply, "output")
                 yield formatted_outputs(reply, shared.model_name)
             else:
                 if not (shared.args.chat or shared.args.cai_chat):
                     yield formatted_outputs(question, shared.model_name)
+
                 # RWKV has proper streaming, which is very nice.
                 # No need to generate 8 tokens at a time.
                 for reply in shared.model.generate_with_streaming(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k):
+                    if not (shared.args.chat or shared.args.cai_chat):
+                        reply = original_question + apply_extensions(reply, "output")
                     yield formatted_outputs(reply, shared.model_name)
+
         except Exception:
             traceback.print_exc()
         finally:
@@ -127,12 +139,6 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
             print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)")
             return
 
-    original_question = question
-    if not (shared.args.chat or shared.args.cai_chat):
-        question = apply_extensions(question, "input")
-    if shared.args.verbose:
-        print(f"\n\n{question}\n--------------------\n")
-
     input_ids = encode(question, max_new_tokens)
     original_input_ids = input_ids
     output = input_ids[0]
@@ -142,9 +148,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
     if eos_token is not None:
         eos_token_ids.append(int(encode(eos_token)[0][-1]))
     stopping_criteria_list = transformers.StoppingCriteriaList()
-    if stopping_string is not None:
-        # Copied from https://github.com/PygmalionAI/gradio-ui/blob/master/src/model.py
-        t = encode(stopping_string, 0, add_special_tokens=False)
+    if type(stopping_strings) is list and len(stopping_strings) > 0:
+        t = [encode(string, 0, add_special_tokens=False) for string in stopping_strings]
         stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=t, starting_idx=len(input_ids[0])))
 
     generate_params = {}
@@ -195,12 +200,10 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
             if shared.soft_prompt:
                 output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
 
+            new_tokens = len(output) - len(input_ids[0])
+            reply = decode(output[-new_tokens:])
             if not (shared.args.chat or shared.args.cai_chat):
-                new_tokens = len(output) - len(input_ids[0])
-                reply = decode(output[-new_tokens:])
                 reply = original_question + apply_extensions(reply, "output")
-            else:
-                reply = decode(output)
 
             yield formatted_outputs(reply, shared.model_name)
 
@@ -223,12 +226,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
                 for output in generator:
                     if shared.soft_prompt:
                         output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
+
+                    new_tokens = len(output) - len(input_ids[0])
+                    reply = decode(output[-new_tokens:])
                     if not (shared.args.chat or shared.args.cai_chat):
-                        new_tokens = len(output) - len(input_ids[0])
-                        reply = decode(output[-new_tokens:])
                         reply = original_question + apply_extensions(reply, "output")
-                    else:
-                        reply = decode(output)
 
                     if output[-1] in eos_token_ids:
                         break
@@ -244,12 +246,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
                 output = shared.model.generate(**generate_params)[0]
                 if shared.soft_prompt:
                     output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
+
+                new_tokens = len(output) - len(original_input_ids[0])
+                reply = decode(output[-new_tokens:])
                 if not (shared.args.chat or shared.args.cai_chat):
-                    new_tokens = len(output) - len(original_input_ids[0])
-                    reply = decode(output[-new_tokens:])
                     reply = original_question + apply_extensions(reply, "output")
-                else:
-                    reply = decode(output)
 
                 if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)):
                     break
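
Note: the patched stopping criterion can be sanity-checked in isolation. The sketch below mirrors the new _SentinelTokenStoppingCriteria logic from modules/callbacks.py under a different, demo-only class name; the token ids and prompt length are made up for illustration and do not come from any real tokenizer.

import torch
import transformers

class MultiSentinelStoppingCriteria(transformers.StoppingCriteria):
    # Same windowed search as the patched class, but over a list of sentinel sequences.
    def __init__(self, sentinel_token_ids: list, starting_idx: int):
        transformers.StoppingCriteria.__init__(self)
        self.sentinel_token_ids = sentinel_token_ids
        self.starting_idx = starting_idx

    def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
        for sample in input_ids:
            trimmed_sample = sample[self.starting_idx:]  # ignore the prompt tokens
            for i in range(len(self.sentinel_token_ids)):
                if trimmed_sample.shape[-1] < self.sentinel_token_ids[i].shape[-1]:
                    continue  # output still shorter than this sentinel, skip it
                for window in trimmed_sample.unfold(0, self.sentinel_token_ids[i].shape[-1], 1):
                    if torch.all(torch.eq(self.sentinel_token_ids[i], window)):
                        return True  # any one of the stopping strings matched
        return False

# Pretend the prompt is 3 tokens long and that "\nYou:" and "\nBot:" encode to
# [7, 8] and [9, 10] (hypothetical ids).
criteria = MultiSentinelStoppingCriteria(
    sentinel_token_ids=[torch.tensor([7, 8]), torch.tensor([9, 10])],
    starting_idx=3)

print(criteria(torch.tensor([[1, 2, 3, 4, 5, 6, 7]]), None))   # False: no sentinel yet
print(criteria(torch.tensor([[1, 2, 3, 4, 5, 9, 10]]), None))  # True: second sentinel found

This is also why generate_reply now takes stopping_strings (a list) instead of a single stopping_string: each string is encoded separately and the criterion fires as soon as any of them appears in the newly generated tokens.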