mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-26 01:30:20 +01:00
Refactor text-generation.py a bit
This commit is contained in:
parent
2f4f124132
commit
1a0c12c6f2
@ -113,9 +113,11 @@ def set_manual_seed(seed):
|
|||||||
seed = int(seed)
|
seed = int(seed)
|
||||||
if seed == -1:
|
if seed == -1:
|
||||||
seed = random.randint(1, 2**31)
|
seed = random.randint(1, 2**31)
|
||||||
|
|
||||||
torch.manual_seed(seed)
|
torch.manual_seed(seed)
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
torch.cuda.manual_seed_all(seed)
|
torch.cuda.manual_seed_all(seed)
|
||||||
|
|
||||||
return seed
|
return seed
|
||||||
|
|
||||||
|
|
||||||
@ -123,8 +125,41 @@ def stop_everything_event():
|
|||||||
shared.stop_everything = True
|
shared.stop_everything = True
|
||||||
|
|
||||||
|
|
||||||
def generate_reply(question, state, eos_token=None, stopping_strings=[]):
|
def get_generate_params(state):
    """Build the keyword arguments passed to the model's generate call.

    The returned dict is selected by backend:

    * ``shared.model_type`` in ``['rwkv', 'llamacpp']``: ``token_count``
      (taken from ``state['max_new_tokens']``) plus ``temperature``,
      ``top_p``, ``top_k`` and ``repetition_penalty``.
    * ``shared.args.flexgen``: ``max_new_tokens``, ``do_sample`` and
      ``temperature``; ``max_new_tokens`` is replaced by 8 unless
      ``shared.args.no_stream`` is set.
    * otherwise (transformers): the full list of sampling settings, plus
      ``suppress_tokens``, ``use_cache`` and ``synced_gpus`` when the
      matching option is enabled.

    Args:
        state: mapping indexed by setting name (``state[name]``).

    Returns:
        A new dict of generation parameters.
    """
    # Models that are not on transformers
    if shared.model_type in ['rwkv', 'llamacpp']:
        params = {'token_count': state['max_new_tokens']}
        for name in ('temperature', 'top_p', 'top_k', 'repetition_penalty'):
            params[name] = state[name]
        return params

    # FlexGen
    if shared.args.flexgen:
        params = {
            name: state[name]
            for name in ('max_new_tokens', 'do_sample', 'temperature')
        }
        # presumably streaming produces output in 8-token chunks -- TODO confirm
        if not shared.args.no_stream:
            params['max_new_tokens'] = 8
        return params

    # transformers
    transformers_keys = (
        'max_new_tokens',
        'do_sample',
        'temperature',
        'top_p',
        'typical_p',
        'repetition_penalty',
        'encoder_repetition_penalty',
        'top_k',
        'min_length',
        'no_repeat_ngram_size',
        'num_beams',
        'penalty_alpha',
        'length_penalty',
        'early_stopping',
    )
    params = {name: state[name] for name in transformers_keys}

    if state['ban_eos_token']:
        params['suppress_tokens'] = [shared.tokenizer.eos_token_id]

    # NOTE(review): indentation was not recoverable from this view; the two
    # options below are assumed to apply to the transformers path only --
    # confirm against the repository before relying on this.
    if shared.args.no_cache:
        params['use_cache'] = False

    if shared.args.deepspeed:
        params['synced_gpus'] = True

    return params
|
||||||
|
|
||||||
|
|
||||||
|
def generate_reply(question, state, eos_token=None, stopping_strings=[]):
|
||||||
if shared.model_name == 'None' or shared.model is None:
|
if shared.model_name == 'None' or shared.model is None:
|
||||||
print("No model is loaded! Select one in the Model tab.")
|
print("No model is loaded! Select one in the Model tab.")
|
||||||
yield formatted_outputs(question, shared.model_name)
|
yield formatted_outputs(question, shared.model_name)
|
||||||
@ -133,40 +168,37 @@ def generate_reply(question, state, eos_token=None, stopping_strings=[]):
|
|||||||
clear_torch_cache()
|
clear_torch_cache()
|
||||||
seed = set_manual_seed(state['seed'])
|
seed = set_manual_seed(state['seed'])
|
||||||
shared.stop_everything = False
|
shared.stop_everything = False
|
||||||
generate_params = {}
|
generate_params = get_generate_params(state)
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
|
|
||||||
|
# Preparing the input
|
||||||
original_question = question
|
original_question = question
|
||||||
if not shared.is_chat():
|
if not shared.is_chat():
|
||||||
question = apply_extensions('input', question)
|
question = apply_extensions('input', question)
|
||||||
|
|
||||||
# These models are not part of Hugging Face, so we handle them
|
# If the model is not on transformers, handle it separately and end this
|
||||||
# separately and terminate the function call earlier
|
# function call earlier.
|
||||||
if shared.model_type in ['rwkv', 'llamacpp']:
|
if shared.model_type in ['rwkv', 'llamacpp']:
|
||||||
|
|
||||||
if shared.args.verbose:
|
if shared.args.verbose:
|
||||||
print(f'\n\n{question}\n--------------------\n')
|
print(f'\n\n{question}\n--------------------\n')
|
||||||
|
|
||||||
for k in ['temperature', 'top_p', 'top_k', 'repetition_penalty']:
|
|
||||||
generate_params[k] = state[k]
|
|
||||||
generate_params['token_count'] = state['max_new_tokens']
|
|
||||||
try:
|
try:
|
||||||
if shared.args.no_stream:
|
if shared.args.no_stream:
|
||||||
reply = shared.model.generate(context=question, **generate_params)
|
reply = shared.model.generate(context=question, **generate_params)
|
||||||
output = original_question + reply
|
output = original_question + reply
|
||||||
if not shared.is_chat():
|
if not shared.is_chat():
|
||||||
reply = original_question + apply_extensions('output', reply)
|
reply = original_question + apply_extensions('output', reply)
|
||||||
|
|
||||||
yield formatted_outputs(reply, shared.model_name)
|
yield formatted_outputs(reply, shared.model_name)
|
||||||
else:
|
else:
|
||||||
if not shared.is_chat():
|
if not shared.is_chat():
|
||||||
yield formatted_outputs(question, shared.model_name)
|
yield formatted_outputs(question, shared.model_name)
|
||||||
|
|
||||||
# RWKV has proper streaming, which is very nice.
|
|
||||||
# No need to generate 8 tokens at a time.
|
|
||||||
for reply in shared.model.generate_with_streaming(context=question, **generate_params):
|
for reply in shared.model.generate_with_streaming(context=question, **generate_params):
|
||||||
output = original_question + reply
|
output = original_question + reply
|
||||||
if not shared.is_chat():
|
if not shared.is_chat():
|
||||||
reply = original_question + apply_extensions('output', reply)
|
reply = original_question + apply_extensions('output', reply)
|
||||||
|
|
||||||
yield formatted_outputs(reply, shared.model_name)
|
yield formatted_outputs(reply, shared.model_name)
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
@ -178,18 +210,19 @@ def generate_reply(question, state, eos_token=None, stopping_strings=[]):
|
|||||||
print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
|
print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Encode the input
|
||||||
input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state))
|
input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state))
|
||||||
output = input_ids[0]
|
output = input_ids[0]
|
||||||
|
cuda = not any((shared.args.cpu, shared.args.deepspeed, shared.args.flexgen))
|
||||||
if shared.args.verbose:
|
if shared.args.verbose:
|
||||||
print(f'\n\n{decode(input_ids[0], state["skip_special_tokens"])}\n--------------------\n')
|
print(f'\n\n{decode(input_ids[0], state["skip_special_tokens"])}\n--------------------\n')
|
||||||
|
|
||||||
cuda = not any((shared.args.cpu, shared.args.deepspeed, shared.args.flexgen))
|
# Find the eos tokens
|
||||||
eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else []
|
eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else []
|
||||||
if eos_token is not None:
|
if eos_token is not None:
|
||||||
eos_token_ids.append(int(encode(eos_token)[0][-1]))
|
eos_token_ids.append(int(encode(eos_token)[0][-1]))
|
||||||
|
|
||||||
# Handling the stopping strings
|
# Create the StoppingCriteriaList with the stopping strings
|
||||||
stopping_criteria_list = transformers.StoppingCriteriaList()
|
stopping_criteria_list = transformers.StoppingCriteriaList()
|
||||||
for st in (stopping_strings, ast.literal_eval(f"[{state['custom_stopping_strings']}]")):
|
for st in (stopping_strings, ast.literal_eval(f"[{state['custom_stopping_strings']}]")):
|
||||||
if type(st) is list and len(st) > 0:
|
if type(st) is list and len(st) > 0:
|
||||||
@ -197,24 +230,14 @@ def generate_reply(question, state, eos_token=None, stopping_strings=[]):
|
|||||||
stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=sentinel_token_ids, starting_idx=len(input_ids[0])))
|
stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=sentinel_token_ids, starting_idx=len(input_ids[0])))
|
||||||
break
|
break
|
||||||
|
|
||||||
if not shared.args.flexgen:
|
# Update generate_params with the eos token and the stopping strings
|
||||||
for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']:
|
if shared.args.flexgen:
|
||||||
generate_params[k] = state[k]
|
generate_params['stop'] = eos_token_ids[-1]
|
||||||
|
else:
|
||||||
generate_params['eos_token_id'] = eos_token_ids
|
generate_params['eos_token_id'] = eos_token_ids
|
||||||
generate_params['stopping_criteria'] = stopping_criteria_list
|
generate_params['stopping_criteria'] = stopping_criteria_list
|
||||||
if state['ban_eos_token']:
|
|
||||||
generate_params['suppress_tokens'] = [shared.tokenizer.eos_token_id]
|
|
||||||
else:
|
|
||||||
for k in ['max_new_tokens', 'do_sample', 'temperature']:
|
|
||||||
generate_params[k] = state[k]
|
|
||||||
generate_params['stop'] = eos_token_ids[-1]
|
|
||||||
if not shared.args.no_stream:
|
|
||||||
generate_params['max_new_tokens'] = 8
|
|
||||||
|
|
||||||
if shared.args.no_cache:
|
# Add the encoded tokens to generate_params
|
||||||
generate_params.update({'use_cache': False})
|
|
||||||
if shared.args.deepspeed:
|
|
||||||
generate_params.update({'synced_gpus': True})
|
|
||||||
if shared.soft_prompt:
|
if shared.soft_prompt:
|
||||||
inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
|
inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
|
||||||
question, filler_input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, filler_input_ids, inputs_embeds)
|
question, filler_input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, filler_input_ids, inputs_embeds)
|
||||||
|
Loading…
Reference in New Issue
Block a user