From 1a0c12c6f203d28b865346a767cd30720737a5ca Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 24 Apr 2023 19:24:12 -0300
Subject: [PATCH] Refactor text_generation.py a bit

---
 modules/text_generation.py | 79 ++++++++++++++++++++++++--------------
 1 file changed, 51 insertions(+), 28 deletions(-)

diff --git a/modules/text_generation.py b/modules/text_generation.py
index 032fc84c..936ec647 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -113,9 +113,11 @@ def set_manual_seed(seed):
     seed = int(seed)
     if seed == -1:
         seed = random.randint(1, 2**31)
+
     torch.manual_seed(seed)
     if torch.cuda.is_available():
         torch.cuda.manual_seed_all(seed)
+
     return seed
 
 
@@ -123,8 +125,41 @@ def stop_everything_event():
     shared.stop_everything = True
 
 
-def generate_reply(question, state, eos_token=None, stopping_strings=[]):
+def get_generate_params(state):
+    generate_params = {}  # keyword arguments for the model's generate call, filled from `state` below
 
+    # Models that are not on transformers: the length limit is passed as 'token_count'
+    if shared.model_type in ['rwkv', 'llamacpp']:
+        generate_params['token_count'] = state['max_new_tokens']
+        for k in ['temperature', 'top_p', 'top_k', 'repetition_penalty']:
+            generate_params[k] = state[k]
+    else:
+        # FlexGen: only a reduced set of sampling parameters is copied from state
+        if shared.args.flexgen:
+            for k in ['max_new_tokens', 'do_sample', 'temperature']:
+                generate_params[k] = state[k]
+            # Unless --no_stream is set, each call is capped at 8 new tokens (presumably the caller streams in chunks; verify)
+            if not shared.args.no_stream:
+                generate_params['max_new_tokens'] = 8
+
+        # transformers: copy the full set of sampling/search parameters from state
+        else:
+            for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']:
+                generate_params[k] = state[k]
+
+            if state['ban_eos_token']:
+                generate_params['suppress_tokens'] = [shared.tokenizer.eos_token_id]
+            # NOTE(review): use_cache/synced_gpus are only set on this transformers path; confirm FlexGen does not need them
+            if shared.args.no_cache:
+                generate_params.update({'use_cache': False})
+
+            if shared.args.deepspeed:
+                generate_params.update({'synced_gpus': True})
+
+    return generate_params
+
+
+def generate_reply(question, state, eos_token=None, stopping_strings=[]):
     if shared.model_name == 'None' or shared.model is None:
         print("No model is loaded! Select one in the Model tab.")
         yield formatted_outputs(question, shared.model_name)
@@ -133,40 +168,37 @@ def generate_reply(question, state, eos_token=None, stopping_strings=[]):
     clear_torch_cache()
     seed = set_manual_seed(state['seed'])
     shared.stop_everything = False
-    generate_params = {}
+    generate_params = get_generate_params(state)
     t0 = time.time()
 
+    # Preparing the input
     original_question = question
     if not shared.is_chat():
         question = apply_extensions('input', question)
 
-    # These models are not part of Hugging Face, so we handle them
-    # separately and terminate the function call earlier
+    # If the model is not on transformers, handle it separately and end this
+    # function call earlier.
     if shared.model_type in ['rwkv', 'llamacpp']:
-
         if shared.args.verbose:
             print(f'\n\n{question}\n--------------------\n')
 
-        for k in ['temperature', 'top_p', 'top_k', 'repetition_penalty']:
-            generate_params[k] = state[k]
-        generate_params['token_count'] = state['max_new_tokens']
         try:
             if shared.args.no_stream:
                 reply = shared.model.generate(context=question, **generate_params)
                 output = original_question + reply
                 if not shared.is_chat():
                     reply = original_question + apply_extensions('output', reply)
+
                 yield formatted_outputs(reply, shared.model_name)
             else:
                 if not shared.is_chat():
                     yield formatted_outputs(question, shared.model_name)
 
-                # RWKV has proper streaming, which is very nice.
-                # No need to generate 8 tokens at a time.
                 for reply in shared.model.generate_with_streaming(context=question, **generate_params):
                     output = original_question + reply
                     if not shared.is_chat():
                         reply = original_question + apply_extensions('output', reply)
+
                     yield formatted_outputs(reply, shared.model_name)
 
         except Exception:
@@ -178,18 +210,19 @@ def generate_reply(question, state, eos_token=None, stopping_strings=[]):
             print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
             return
 
+    # Encode the input
     input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state))
     output = input_ids[0]
-
+    cuda = not any((shared.args.cpu, shared.args.deepspeed, shared.args.flexgen))
     if shared.args.verbose:
         print(f'\n\n{decode(input_ids[0], state["skip_special_tokens"])}\n--------------------\n')
 
-    cuda = not any((shared.args.cpu, shared.args.deepspeed, shared.args.flexgen))
+    # Find the eos tokens
     eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else []
     if eos_token is not None:
         eos_token_ids.append(int(encode(eos_token)[0][-1]))
 
-    # Handling the stopping strings
+    # Create the StoppingCriteriaList with the stopping strings
     stopping_criteria_list = transformers.StoppingCriteriaList()
     for st in (stopping_strings, ast.literal_eval(f"[{state['custom_stopping_strings']}]")):
         if type(st) is list and len(st) > 0:
@@ -197,24 +230,14 @@ def generate_reply(question, state, eos_token=None, stopping_strings=[]):
             stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=sentinel_token_ids, starting_idx=len(input_ids[0])))
             break
 
-    if not shared.args.flexgen:
-        for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']:
-            generate_params[k] = state[k]
+    # Update generate_params with the eos token and the stopping strings
+    if shared.args.flexgen:
+        generate_params['stop'] = eos_token_ids[-1]
+    else:
         generate_params['eos_token_id'] = eos_token_ids
         generate_params['stopping_criteria'] = stopping_criteria_list
-        if state['ban_eos_token']:
-            generate_params['suppress_tokens'] = [shared.tokenizer.eos_token_id]
-    else:
-        for k in ['max_new_tokens', 'do_sample', 'temperature']:
-            generate_params[k] = state[k]
-        generate_params['stop'] = eos_token_ids[-1]
-        if not shared.args.no_stream:
-            generate_params['max_new_tokens'] = 8
 
-    if shared.args.no_cache:
-        generate_params.update({'use_cache': False})
-    if shared.args.deepspeed:
-        generate_params.update({'synced_gpus': True})
+    # Add the encoded tokens to generate_params
     if shared.soft_prompt:
         inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
         question, filler_input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, filler_input_ids, inputs_embeds)