From 9edb193defc567d9626e210426048a61554cbdb0 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 5 Dec 2023 00:00:40 -0300
Subject: [PATCH] Optimize HF text generation (#4814)

---
 modules/text_generation.py | 41 +++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/modules/text_generation.py b/modules/text_generation.py
index 3a4c55b3..c5bfceb7 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -93,9 +93,10 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
                 last_update = time.time()
                 yield reply
 
-            # Limit updates to 24 per second to not stress low latency networks
+            # Limit updates to 24 per second, or 5 per second when shared.args.listen or shared.args.share is set, to avoid lag
             else:
-                if cur_time - last_update > 0.041666666666666664:
+                min_update_interval = 0.2 if (shared.args.listen or shared.args.share) else 0.0417
+                if cur_time - last_update > min_update_interval:
                     last_update = cur_time
                     yield reply
 
@@ -218,20 +219,6 @@ def fix_galactica(s):
     return s
 
 
-def get_reply_from_output_ids(output_ids, input_ids, original_question, state, is_chat=False):
-    if shared.is_seq2seq:
-        reply = decode(output_ids, state['skip_special_tokens'])
-    else:
-        new_tokens = len(output_ids) - len(input_ids[0])
-        reply = decode(output_ids[-new_tokens:], state['skip_special_tokens'])
-        # Prevent LlamaTokenizer from skipping a space
-        if type(shared.tokenizer) in [transformers.LlamaTokenizer, transformers.LlamaTokenizerFast] and len(output_ids) > 0:
-            if shared.tokenizer.convert_ids_to_tokens(int(output_ids[-new_tokens])).startswith('▁'):
-                reply = ' ' + reply
-
-    return reply
-
-
 def set_manual_seed(seed):
     seed = int(seed)
     if seed == -1:
@@ -242,6 +229,7 @@ def set_manual_seed(seed):
         torch.cuda.manual_seed_all(seed)
     elif is_torch_xpu_available():
         torch.xpu.manual_seed_all(seed)
+
     return seed
 
 
@@ -274,6 +262,19 @@ def apply_stopping_strings(reply, all_stop_strings):
     return reply, stop_found
 
 
+def get_reply_from_output_ids(output_ids, state, starting_from=0):
+    """Decode generated token ids into reply text; non-seq2seq models decode only output_ids[starting_from:]."""
+    if shared.is_seq2seq:
+        reply = decode(output_ids, state['skip_special_tokens'])
+    else:
+        reply = decode(output_ids[starting_from:], state['skip_special_tokens'])
+        # Prevent LlamaTokenizer from skipping a space. Guard on starting_from (not 0) so an empty tail cannot raise IndexError.
+        if type(shared.tokenizer) in [transformers.LlamaTokenizer, transformers.LlamaTokenizerFast] and len(output_ids) > starting_from:
+            if shared.tokenizer.convert_ids_to_tokens(int(output_ids[starting_from])).startswith('▁'):
+                reply = ' ' + reply
+    return reply
+
+
 def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False):
     generate_params = {}
     for k in ['max_new_tokens', 'do_sample', 'temperature', 'temperature_last', 'top_p', 'min_p', 'typical_p', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'tfs', 'top_a', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'guidance_scale']:
@@ -341,7 +342,7 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
                 if cuda:
                     output = output.cuda()
 
-            yield get_reply_from_output_ids(output, input_ids, original_question, state, is_chat=is_chat)
+            yield get_reply_from_output_ids(output, state, starting_from=len(input_ids[0]))
 
         # Stream the reply 1 token at a time.
         # This is based on the trick of using 'stopping_criteria' to create an iterator.
@@ -357,11 +358,15 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
                 return Iteratorize(generate_with_callback, [], kwargs, callback=None)
 
             with generate_with_streaming(**generate_params) as generator:
+                cumulative_reply = ''
+                starting_from = len(input_ids[0])
                 for output in generator:
                     if output[-1] in eos_token_ids:
                         break
 
-                    yield get_reply_from_output_ids(output, input_ids, original_question, state, is_chat=is_chat)
+                    cumulative_reply += get_reply_from_output_ids(output, state, starting_from=starting_from)
+                    starting_from = len(output)
+                    yield cumulative_reply
 
     except Exception:
         traceback.print_exc()