Repository: https://github.com/oobabooga/text-generation-webui.git
Commit: 4578e88ffd (parent 9bf6ecf9e2)

Stop the bot from talking for you in chat mode
@@ -45,11 +45,11 @@ class RWKVModel:
             token_stop = token_stop
         )
 
-        return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
+        return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
 
     def generate_with_streaming(self, **kwargs):
         with Iteratorize(self.generate, kwargs, callback=None) as generator:
-            reply = kwargs['context']
+            reply = ''
            for token in generator:
                reply += token
                yield reply
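With this change, RWKVModel.generate() returns only the model's continuation instead of prompt + continuation, so the streaming wrapper now accumulates into an empty buffer rather than seeding it with the prompt. A minimal sketch of that accumulation pattern, with a fake token source standing in for the Iteratorize-wrapped pipeline (not project code):

# Sketch only: fake_token_stream replaces the real RWKV pipeline.
def fake_token_stream():
    for token in ["Hello", ",", " world", "!"]:
        yield token

def generate_with_streaming_sketch():
    reply = ''  # start empty: callers now receive only newly generated text
    for token in fake_token_stream():
        reply += token
        yield reply

for partial in generate_with_streaming_sketch():
    print(partial)  # "Hello", "Hello,", "Hello, world", "Hello, world!"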
@@ -11,24 +11,22 @@ import modules.shared as shared
 # Copied from https://github.com/PygmalionAI/gradio-ui/
 class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
 
-    def __init__(self, sentinel_token_ids: torch.LongTensor,
-                 starting_idx: int):
+    def __init__(self, sentinel_token_ids: list[torch.LongTensor], starting_idx: int):
         transformers.StoppingCriteria.__init__(self)
         self.sentinel_token_ids = sentinel_token_ids
         self.starting_idx = starting_idx
 
-    def __call__(self, input_ids: torch.LongTensor,
-                 _scores: torch.FloatTensor) -> bool:
+    def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
         for sample in input_ids:
             trimmed_sample = sample[self.starting_idx:]
-            # Can't unfold, output is still too tiny. Skip.
-            if trimmed_sample.shape[-1] < self.sentinel_token_ids.shape[-1]:
-                continue
 
-            for window in trimmed_sample.unfold(
-                    0, self.sentinel_token_ids.shape[-1], 1):
-                if torch.all(torch.eq(self.sentinel_token_ids, window)):
-                    return True
+            for i in range(len(self.sentinel_token_ids)):
+                # Can't unfold, output is still too tiny. Skip.
+                if trimmed_sample.shape[-1] < self.sentinel_token_ids[i].shape[-1]:
+                    continue
+                for window in trimmed_sample.unfold(0, self.sentinel_token_ids[i].shape[-1], 1):
+                    if torch.all(torch.eq(self.sentinel_token_ids[i], window)):
+                        return True
         return False
 
 class Stream(transformers.StoppingCriteria):
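The rewritten criterion takes a list of sentinel token sequences and scans only the tokens generated after starting_idx, sliding each sentinel across the generated tail with Tensor.unfold. A standalone sketch of that matching step, using made-up token IDs:

import torch

# Hypothetical token IDs, for illustration only.
generated_tail = torch.LongTensor([5, 9, 2, 7, 3, 1])
sentinels = [torch.LongTensor([7, 3]), torch.LongTensor([9, 9, 9])]

def matches_any_sentinel(tail, sentinel_token_ids):
    for sentinel in sentinel_token_ids:
        # Sentinel longer than the generated tail: can't unfold, skip it.
        if tail.shape[-1] < sentinel.shape[-1]:
            continue
        # unfold(0, n, 1) yields every contiguous window of length n.
        for window in tail.unfold(0, sentinel.shape[-1], 1):
            if torch.all(torch.eq(sentinel, window)):
                return True
    return False

print(matches_any_sentinel(generated_tail, sentinels))  # True: [7, 3] occurs in the tail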
@@ -51,41 +51,31 @@ def generate_chat_prompt(user_input, max_new_tokens, name1, name2, context, chat
     prompt = ''.join(rows)
     return prompt
 
-def extract_message_from_reply(question, reply, name1, name2, check, impersonate=False):
+def extract_message_from_reply(reply, name1, name2, check):
     next_character_found = False
 
-    asker = name1 if not impersonate else name2
-    replier = name2 if not impersonate else name1
-
-    previous_idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", question)]
-    idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", reply)]
-    idx = idx[max(len(previous_idx)-1, 0)]
-
-    if not impersonate:
-        reply = reply[idx + 1 + len(apply_extensions(f"{replier}:", "bot_prefix")):]
-    else:
-        reply = reply[idx + 1 + len(f"{replier}:"):]
-
     if check:
         lines = reply.split('\n')
         reply = lines[0].strip()
         if len(lines) > 1:
             next_character_found = True
     else:
-        idx = reply.find(f"\n{asker}:")
-        if idx != -1:
-            reply = reply[:idx]
-            next_character_found = True
-        reply = fix_newlines(reply)
+        for string in [f"\n{name1}:", f"\n{name2}:"]:
+            idx = reply.find(string)
+            if idx != -1:
+                reply = reply[:idx]
+                next_character_found = True
 
         # If something like "\nYo" is generated just before "\nYou:"
         # is completed, trim it
-        next_turn = f"\n{asker}:"
-        for j in range(len(next_turn)-1, 0, -1):
-            if reply[-j:] == next_turn[:j]:
-                reply = reply[:-j]
-                break
+        if not next_character_found:
+            for string in [f"\n{name1}:", f"\n{name2}:"]:
+                for j in range(len(string)-1, 0, -1):
+                    if reply[-j:] == string[:j]:
+                        reply = reply[:-j]
+                        break
 
+    reply = fix_newlines(reply)
     return reply, next_character_found
 
 def stop_everything_event():
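extract_message_from_reply no longer needs the prompt or an impersonate flag: it cuts the generated text at the first occurrence of either speaker's turn prefix, and if no full prefix was produced it strips a partially generated one from the end. A self-contained sketch of that trimming with hypothetical names and text:

def trim_reply_sketch(reply, name1, name2):
    # Cut at the first full "\nName:" turn prefix from either speaker.
    next_character_found = False
    for string in [f"\n{name1}:", f"\n{name2}:"]:
        idx = reply.find(string)
        if idx != -1:
            reply = reply[:idx]
            next_character_found = True

    # If something like "\nYo" was generated just before "\nYou:" was
    # completed, trim the partial prefix from the end.
    if not next_character_found:
        for string in [f"\n{name1}:", f"\n{name2}:"]:
            for j in range(len(string) - 1, 0, -1):
                if reply[-j:] == string[:j]:
                    reply = reply[:-j]
                    break

    return reply, next_character_found

# Hypothetical example: the model tried to speak for the user.
print(trim_reply_sketch("Sure, I can help.\nYou: thanks", "You", "Bot"))
# -> ('Sure, I can help.', True)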
@@ -127,10 +117,10 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
     # Generate
     reply = ''
     for i in range(chat_generation_attempts):
-        for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_string=f"\n{name1}:"):
+        for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
 
             # Extracting the reply
-            reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check)
+            reply, next_character_found = extract_message_from_reply(reply, name1, name2, check)
             visible_reply = re.sub("(<USER>|<user>|{{user}})", name1_original, reply)
             visible_reply = apply_extensions(visible_reply, "output")
             if shared.args.chat:
@@ -166,8 +156,8 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ
     # Yield *Is typing...*
     yield shared.processing_message
     for i in range(chat_generation_attempts):
-        for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_string=f"\n{name2}:"):
-            reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check, impersonate=True)
+        for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
+            reply, next_character_found = extract_message_from_reply(reply, name1, name2, check)
             yield reply
             if next_character_found:
                 break
@@ -99,25 +99,37 @@ def set_manual_seed(seed):
     if torch.cuda.is_available():
         torch.cuda.manual_seed_all(seed)
 
-def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=None, stopping_string=None):
+def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=None, stopping_strings=[]):
     clear_torch_cache()
     set_manual_seed(seed)
     t0 = time.time()
 
+    original_question = question
+    if not (shared.args.chat or shared.args.cai_chat):
+        question = apply_extensions(question, "input")
+    if shared.args.verbose:
+        print(f"\n\n{question}\n--------------------\n")
+
     # These models are not part of Hugging Face, so we handle them
     # separately and terminate the function call earlier
     if shared.is_RWKV:
         try:
             if shared.args.no_stream:
                 reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k)
+                if not (shared.args.chat or shared.args.cai_chat):
+                    reply = original_question + apply_extensions(reply, "output")
                 yield formatted_outputs(reply, shared.model_name)
             else:
                 if not (shared.args.chat or shared.args.cai_chat):
                     yield formatted_outputs(question, shared.model_name)
 
                 # RWKV has proper streaming, which is very nice.
                 # No need to generate 8 tokens at a time.
                 for reply in shared.model.generate_with_streaming(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k):
+                    if not (shared.args.chat or shared.args.cai_chat):
+                        reply = original_question + apply_extensions(reply, "output")
                     yield formatted_outputs(reply, shared.model_name)
 
         except Exception:
             traceback.print_exc()
         finally:
@@ -127,12 +139,6 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
             print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)")
             return
 
-    original_question = question
-    if not (shared.args.chat or shared.args.cai_chat):
-        question = apply_extensions(question, "input")
-    if shared.args.verbose:
-        print(f"\n\n{question}\n--------------------\n")
-
     input_ids = encode(question, max_new_tokens)
     original_input_ids = input_ids
     output = input_ids[0]
@@ -142,9 +148,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
     if eos_token is not None:
         eos_token_ids.append(int(encode(eos_token)[0][-1]))
     stopping_criteria_list = transformers.StoppingCriteriaList()
-    if stopping_string is not None:
-        # Copied from https://github.com/PygmalionAI/gradio-ui/blob/master/src/model.py
-        t = encode(stopping_string, 0, add_special_tokens=False)
+    if type(stopping_strings) is list and len(stopping_strings) > 0:
+        t = [encode(string, 0, add_special_tokens=False) for string in stopping_strings]
         stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=t, starting_idx=len(input_ids[0])))
 
     generate_params = {}
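generate_reply now accepts a list of stopping_strings instead of a single stopping_string; each string is tokenized on its own and the whole list is handed to one _SentinelTokenStoppingCriteria. A rough sketch of how such a criteria list plugs into a Hugging Face generate() call; the gpt2 model, tokenizer, prompt, and stopping strings below are placeholders, not the project's code:

import transformers

# Assumed setup; any causal LM and its tokenizer would do.
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")

prompt = "Bot: Hello!\nYou: Hi.\nBot:"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Encode each stopping string into its own tensor of token IDs.
stopping_strings = ["\nYou:", "\nBot:"]
sentinel_token_ids = [tokenizer(s, add_special_tokens=False, return_tensors="pt").input_ids[0]
                      for s in stopping_strings]

# The _SentinelTokenStoppingCriteria defined in the diff above would be appended
# here; a StoppingCriteriaList can hold any number of criteria.
stopping_criteria_list = transformers.StoppingCriteriaList()
# stopping_criteria_list.append(_SentinelTokenStoppingCriteria(
#     sentinel_token_ids=sentinel_token_ids, starting_idx=input_ids.shape[1]))

output = model.generate(input_ids, max_new_tokens=30,
                        stopping_criteria=stopping_criteria_list)
print(tokenizer.decode(output[0][input_ids.shape[1]:]))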
@@ -195,12 +200,10 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
             if shared.soft_prompt:
                 output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
 
-            if not (shared.args.chat or shared.args.cai_chat):
-                new_tokens = len(output) - len(input_ids[0])
-                reply = decode(output[-new_tokens:])
+            new_tokens = len(output) - len(input_ids[0])
+            reply = decode(output[-new_tokens:])
+            if not (shared.args.chat or shared.args.cai_chat):
                 reply = original_question + apply_extensions(reply, "output")
-            else:
-                reply = decode(output)
 
             yield formatted_outputs(reply, shared.model_name)
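All three Hugging Face code paths now build the reply the same way: decode only the newly generated tokens, and only in non-chat mode prepend the untouched prompt and apply the output extensions. A tiny sketch of the slicing, with a stand-in decode() and hypothetical token IDs:

# Stand-in decode(); the real one wraps the model's tokenizer.
def decode(token_ids):
    return " ".join(str(t) for t in token_ids)

input_ids = [[101, 2023, 2003]]          # hypothetical prompt tokens
output = [101, 2023, 2003, 1037, 3231]   # prompt tokens + generated tokens

new_tokens = len(output) - len(input_ids[0])
reply = decode(output[-new_tokens:])     # decodes only the generated tail
print(new_tokens, repr(reply))           # 2 '1037 3231'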
@@ -223,12 +226,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
                 for output in generator:
                     if shared.soft_prompt:
                         output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
 
-                    if not (shared.args.chat or shared.args.cai_chat):
-                        new_tokens = len(output) - len(input_ids[0])
-                        reply = decode(output[-new_tokens:])
+                    new_tokens = len(output) - len(input_ids[0])
+                    reply = decode(output[-new_tokens:])
+                    if not (shared.args.chat or shared.args.cai_chat):
                         reply = original_question + apply_extensions(reply, "output")
-                    else:
-                        reply = decode(output)
 
                     if output[-1] in eos_token_ids:
                         break
@@ -244,12 +246,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
                 output = shared.model.generate(**generate_params)[0]
                 if shared.soft_prompt:
                     output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
 
-                if not (shared.args.chat or shared.args.cai_chat):
-                    new_tokens = len(output) - len(original_input_ids[0])
-                    reply = decode(output[-new_tokens:])
+                new_tokens = len(output) - len(original_input_ids[0])
+                reply = decode(output[-new_tokens:])
+                if not (shared.args.chat or shared.args.cai_chat):
                     reply = original_question + apply_extensions(reply, "output")
-                else:
-                    reply = decode(output)
 
                 if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)):
                     break