Mirror of https://github.com/oobabooga/text-generation-webui.git, synced 2024-11-25 09:19:23 +01:00
Optimize HF text generation (#4814)

commit 9edb193def, parent ac9f154bcc
@@ -93,9 +93,10 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
                     last_update = time.time()
                     yield reply

-                # Limit updates to 24 per second to not stress low latency networks
+                # Limit updates to 24 or 5 per second to avoid lag
                 else:
-                    if cur_time - last_update > 0.041666666666666664:
+                    min_update_interval = 0.2 if (shared.args.listen or shared.args.share) else 0.0417
+                    if cur_time - last_update > min_update_interval:
                         last_update = cur_time
                         yield reply

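The new min_update_interval makes the streaming throttle adaptive: roughly 24 UI updates per second (0.0417 s between yields) for a local session, but only 5 per second (0.2 s) when the server is exposed over the network via --listen or --share. A minimal standalone sketch of the same pattern, with a hypothetical token iterator standing in for the model's generator:

import time

def throttled_stream(token_iter, networked=False):
    # A sketch, not the repo's code: 0.2 s (~5 updates/s) when serving
    # over the network, 0.0417 s (~24/s) for a local low-latency session.
    min_update_interval = 0.2 if networked else 0.0417
    reply = ''
    last_update = time.time()
    for token in token_iter:
        reply += token
        cur_time = time.time()
        if cur_time - last_update > min_update_interval:
            last_update = cur_time
            yield reply

    # Always emit the final reply, even if the last window was throttled
    yield reply

Consumers see at most one update per interval, so slow or remote clients are not flooded with partial replies.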
@@ -218,20 +219,6 @@ def fix_galactica(s):
     return s


-def get_reply_from_output_ids(output_ids, input_ids, original_question, state, is_chat=False):
-    if shared.is_seq2seq:
-        reply = decode(output_ids, state['skip_special_tokens'])
-    else:
-        new_tokens = len(output_ids) - len(input_ids[0])
-        reply = decode(output_ids[-new_tokens:], state['skip_special_tokens'])
-        # Prevent LlamaTokenizer from skipping a space
-        if type(shared.tokenizer) in [transformers.LlamaTokenizer, transformers.LlamaTokenizerFast] and len(output_ids) > 0:
-            if shared.tokenizer.convert_ids_to_tokens(int(output_ids[-new_tokens])).startswith('▁'):
-                reply = ' ' + reply
-
-    return reply
-
-
 def set_manual_seed(seed):
     seed = int(seed)
     if seed == -1:
@@ -242,6 +229,7 @@ def set_manual_seed(seed):
         torch.cuda.manual_seed_all(seed)
     elif is_torch_xpu_available():
         torch.xpu.manual_seed_all(seed)

     return seed

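For reference, a runnable sketch of the seeding helper these context lines come from. The torch.manual_seed call and the random-seed bounds are assumptions; only the accelerator branches and the return are visible in this hunk:

import random

import torch

def set_manual_seed_sketch(seed):
    # seed == -1 means "pick one at random" (the bounds are an assumption,
    # not shown in this hunk)
    seed = int(seed)
    if seed == -1:
        seed = random.randint(1, 2**31)

    torch.manual_seed(seed)  # assumption: CPU seeding precedes the branches
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # The module also seeds Intel GPUs via torch.xpu.manual_seed_all(seed)
    # when transformers' is_torch_xpu_available() reports one.

    return seed

Returning the seed lets callers report which seed was actually used when -1 was requested.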
@@ -274,6 +262,19 @@ def apply_stopping_strings(reply, all_stop_strings):
     return reply, stop_found


+def get_reply_from_output_ids(output_ids, state, starting_from=0):
+    if shared.is_seq2seq:
+        reply = decode(output_ids, state['skip_special_tokens'])
+    else:
+        reply = decode(output_ids[starting_from:], state['skip_special_tokens'])
+        # Prevent LlamaTokenizer from skipping a space
+        if type(shared.tokenizer) in [transformers.LlamaTokenizer, transformers.LlamaTokenizerFast] and len(output_ids) > 0:
+            if shared.tokenizer.convert_ids_to_tokens(int(output_ids[starting_from])).startswith('▁'):
+                reply = ' ' + reply
+
+    return reply
+
+
 def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False):
     generate_params = {}
     for k in ['max_new_tokens', 'do_sample', 'temperature', 'temperature_last', 'top_p', 'min_p', 'typical_p', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'tfs', 'top_a', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'guidance_scale']:
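The rewritten get_reply_from_output_ids drops the input_ids, original_question and is_chat parameters and instead takes a starting_from token offset, so callers can decode an arbitrary suffix of output_ids. The '▁' check exists because SentencePiece-based tokenizers (LlamaTokenizer, LlamaTokenizerFast) mark a word-initial space on the token itself, and decoding a mid-sequence slice silently drops that space. A self-contained illustration with a stub tokenizer (hypothetical, standing in for shared.tokenizer and the module's decode helper):

class StubSentencePieceTokenizer:
    # Tiny stand-in vocab; '▁' marks a word-initial space, as in SentencePiece
    vocab = {0: '▁Hello', 1: '▁world', 2: '!'}

    def convert_ids_to_tokens(self, token_id):
        return self.vocab[token_id]

    def decode(self, ids):
        # Like real tokenizers, drop the space in front of the first token
        return ''.join(self.vocab[i] for i in ids).replace('▁', ' ').lstrip()

tokenizer = StubSentencePieceTokenizer()

def reply_from(output_ids, starting_from=0):
    reply = tokenizer.decode(output_ids[starting_from:])
    # Restore the space that decoding a mid-sequence slice would lose
    if len(output_ids) > 0 and tokenizer.convert_ids_to_tokens(output_ids[starting_from]).startswith('▁'):
        reply = ' ' + reply

    return reply

output_ids = [0, 1, 2]  # '▁Hello', '▁world', '!'
print(repr(reply_from(output_ids, starting_from=1)))  # ' world!' (without the check: 'world!')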
@@ -341,7 +342,7 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
                 if cuda:
                     output = output.cuda()

-            yield get_reply_from_output_ids(output, input_ids, original_question, state, is_chat=is_chat)
+            yield get_reply_from_output_ids(output, state, starting_from=len(input_ids[0]))

         # Stream the reply 1 token at a time.
         # This is based on the trick of using 'stopping_criteria' to create an iterator.
@@ -357,11 +358,15 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
                 return Iteratorize(generate_with_callback, [], kwargs, callback=None)

             with generate_with_streaming(**generate_params) as generator:
+                cumulative_reply = ''
+                starting_from = len(input_ids[0])
                 for output in generator:
                     if output[-1] in eos_token_ids:
                         break

-                    yield get_reply_from_output_ids(output, input_ids, original_question, state, is_chat=is_chat)
+                    cumulative_reply += get_reply_from_output_ids(output, state, starting_from=starting_from)
+                    starting_from = len(output)
+                    yield cumulative_reply

     except Exception:
         traceback.print_exc()
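This streaming-loop change is the core of the optimization. Before, every iteration re-decoded all tokens generated so far, so total decode work grew quadratically with reply length; now each iteration decodes only the tokens added since the previous yield and appends them to cumulative_reply, advancing starting_from to len(output). A schematic cost comparison using a hypothetical decode stub that just counts the tokens it touches:

decoded = 0

def decode_stub(ids):
    # Hypothetical stand-in for the real decode(); counts work done
    global decoded
    decoded += len(ids)
    return ''.join(ids)

tokens = list('abcdefgh')  # pretend each character is one generated token

# Old approach: re-decode the whole reply on every step -> O(n^2) total
decoded = 0
for step in range(1, len(tokens) + 1):
    reply = decode_stub(tokens[:step])
print('old total tokens decoded:', decoded)  # 36 for 8 steps

# New approach: decode only the delta and accumulate -> O(n) total
decoded = 0
cumulative_reply, starting_from = '', 0
for step in range(1, len(tokens) + 1):
    cumulative_reply += decode_stub(tokens[starting_from:step])
    starting_from = step
print('new total tokens decoded:', decoded)  # 8 for 8 steps

In the real loop, output also contains the prompt tokens, so starting_from begins at len(input_ids[0]) rather than 0, and the '▁' handling above keeps the per-delta decodes from losing word-initial spaces at the seams.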