From ad6b699503eeabcad141efb6172ff43dc1976522 Mon Sep 17 00:00:00 2001
From: Xan <70198941+xanthousm@users.noreply.github.com>
Date: Wed, 8 Mar 2023 22:02:17 +1100
Subject: [PATCH 1/9] Better TTS with autoplay

- Adds "still_streaming" to the shared module so extensions can know whether generation is complete
- Changed the TTS extension with new options:
  - Show text under the audio widget
  - Automatically play the audio once text generation finishes
  - Manage the generated wav files (only keep files for finished generations, optional max file limit)
  - [wip] Ability to change voice pitch and speed
- Added 'tensorboard' to requirements, since Python gave "tensorboard not found" errors after a fresh installation.
---
 extensions/silero_tts/requirements.txt |  1 +
 extensions/silero_tts/script.py        | 60 +++++++++++++++++++++++---
 modules/shared.py                      |  1 +
 modules/text_generation.py             | 11 ++++-
 requirements.txt                       |  1 +
 5 files changed, 67 insertions(+), 7 deletions(-)

diff --git a/extensions/silero_tts/requirements.txt b/extensions/silero_tts/requirements.txt
index f2f0bff5..b4444306 100644
--- a/extensions/silero_tts/requirements.txt
+++ b/extensions/silero_tts/requirements.txt
@@ -4,3 +4,4 @@ pydub
 PyYAML
 torch
 torchaudio
+simpleaudio
diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py
index f697d0e2..03319dbf 100644
--- a/extensions/silero_tts/script.py
+++ b/extensions/silero_tts/script.py
@@ -4,20 +4,36 @@ from pathlib import Path
 import gradio as gr
 import torch
 
+import modules.shared as shared
+import simpleaudio as sa
+
 torch._C._jit_set_profiling_mode(False)
 
 params = {
     'activate': True,
-    'speaker': 'en_56',
+    'speaker': 'en_5',
     'language': 'en',
     'model_id': 'v3_en',
     'sample_rate': 48000,
     'device': 'cpu',
+    'max_wavs': 20,
+    'play_audio': True,
+    'show_text': True,
 }
 current_params = params.copy()
 voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115']
 wav_idx = 0
 
+table = str.maketrans({
+    "<": "&lt;",
+    ">": "&gt;",
+    "&": "&amp;",
+    "'": "&apos;",
+    '"': "&quot;",
+})
+def xmlesc(txt):
+    return txt.translate(table)
+
 def load_model():
     model, example_text = torch.hub.load(repo_or_dir='snakers4/silero-models', model='silero_tts', language=params['language'], speaker=params['model_id'])
     model.to(params['device'])
@@ -58,20 +74,45 @@ def output_modifier(string):
     if params['activate'] == False:
         return string
 
+    orig_string = string
     string = remove_surrounded_chars(string)
     string = string.replace('"', '')
     string = string.replace('“', '')
     string = string.replace('\n', ' ')
     string = string.strip()
 
+    auto_playable=True
     if string == '':
-        string = 'empty reply, try regenerating'
+        string = 'empty reply, try regenerating'
+        auto_playable=False
+
+    #x-slow, slow, medium, fast, x-fast
+    #x-low, low, medium, high, x-high
+    #prosody='<prosody rate="x-slow" pitch="x-high">'
+    prosody='<prosody rate="medium" pitch="medium">'
+    string ='<speak>'+prosody+xmlesc(string)+'</prosody></speak>'
+
     output_file = Path(f'extensions/silero_tts/outputs/{wav_idx:06d}.wav')
-    audio = model.save_wav(text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
-
+    audio = model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
     string = f'<audio src="file/{output_file.as_posix()}" controls></audio>'
-    wav_idx += 1
+
+    #reset if too many wavs. set max to -1 for unlimited.
+    if wav_idx < params['max_wavs'] and params['max_wavs'] > 0:
+        #only increment if starting a new stream, else replace during streaming. Does not update duration on webui sometimes?
+        if not shared.still_streaming:
+            wav_idx += 1
+    else:
+        wav_idx = 0
+
+    if params['show_text']:
+        string+='\n\n'+orig_string
+
+    #if params['play_audio'] == True and auto_playable and shared.stop_everything:
+    if params['play_audio'] == True and auto_playable and not shared.still_streaming:
+        stop_autoplay()
+        wave_obj = sa.WaveObject.from_wave_file(output_file.as_posix())
+        wave_obj.play()
 
     return string
 
@@ -84,11 +125,20 @@ def bot_prefix_modifier(string):
 
     return string
 
+def stop_autoplay():
+    sa.stop_all()
+
 def ui():
     # Gradio elements
     activate = gr.Checkbox(value=params['activate'], label='Activate TTS')
+    show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player')
+    play_audio = gr.Checkbox(value=params['play_audio'], label='Play TTS automatically')
+    stop_audio = gr.Button("Stop Auto-Play")
     voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice')
 
     # Event functions to update the parameters in the backend
     activate.change(lambda x: params.update({"activate": x}), activate, None)
+    play_audio.change(lambda x: params.update({"play_audio": x}), play_audio, None)
+    show_text.change(lambda x: params.update({"show_text": x}), show_text, None)
+    stop_audio.click(stop_autoplay)
     voice.change(lambda x: params.update({"speaker": x}), voice, None)
diff --git a/modules/shared.py b/modules/shared.py
index e9dfdaa2..90adb320 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -12,6 +12,7 @@ is_LLaMA = False
 history = {'internal': [], 'visible': []}
 character = 'None'
 stop_everything = False
+still_streaming = False
 
 # UI elements (buttons, sliders, HTML, etc)
 gradio = {}
diff --git a/modules/text_generation.py b/modules/text_generation.py
index f9082a31..c9f4fc6a 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -182,6 +182,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
     # Generate the reply 8 tokens at a time
     else:
         yield formatted_outputs(original_question, shared.model_name)
+        shared.still_streaming = True
         for i in tqdm(range(max_new_tokens//8+1)):
             with torch.no_grad():
                 output = eval(f"shared.model.generate({', '.join(generate_params)}){cuda}")[0]
@@ -191,8 +192,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
             reply = decode(output)
             if not (shared.args.chat or shared.args.cai_chat):
                 reply = original_question + apply_extensions(reply[len(question):], "output")
-            yield formatted_outputs(reply, shared.model_name)
-
+
             if not shared.args.flexgen:
                 if output[-1] == n:
                     break
@@ -201,6 +201,13 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
             if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n):
                 break
             input_ids = np.reshape(output, (1, output.shape[0]))
+
+            #Mid-stream yield, ran if no breaks
+            yield formatted_outputs(reply, shared.model_name)
             if shared.soft_prompt:
                 inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
+
+    #Stream finished from max tokens or break. Do final yield.
+    shared.still_streaming = False
+    yield formatted_outputs(reply, shared.model_name)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 55aeb8fd..48ca1e4e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ numpy
 rwkv==0.0.6
 safetensors==0.2.8
 git+https://github.com/huggingface/transformers
+tensorboard

From 738be6dd59a6f9c2ee215093675f2d55111d89ca Mon Sep 17 00:00:00 2001
From: Xan <70198941+xanthousm@users.noreply.github.com>
Date: Wed, 8 Mar 2023 22:25:55 +1100
Subject: [PATCH 2/9] Fix merge errors and unlimited wav bug

---
 extensions/silero_tts/script.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py
index 53bd554c..eaf56159 100644
--- a/extensions/silero_tts/script.py
+++ b/extensions/silero_tts/script.py
@@ -93,11 +93,11 @@ def output_modifier(string):
     string ='<speak>'+prosody+xmlesc(string)+'</prosody></speak>'
 
     output_file = Path(f'extensions/silero_tts/outputs/{wav_idx:06d}.wav')
-    model.save_wav(text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
+    model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
     string = f'<audio src="file/{output_file.as_posix()}" controls></audio>'
 
     #reset if too many wavs. set max to -1 for unlimited.
-    if wav_idx < params['max_wavs'] and params['max_wavs'] > 0:
+    if wav_idx < params['max_wavs'] or params['max_wavs'] < 0:
         #only increment if starting a new stream, else replace during streaming. Does not update duration on webui sometimes?
         if not shared.still_streaming:
             wav_idx += 1

From a2b5383398adc6da5c46811179bfadaefa5e23f7 Mon Sep 17 00:00:00 2001
From: Xan <70198941+xanthousm@users.noreply.github.com>
Date: Thu, 9 Mar 2023 10:48:44 +1100
Subject: [PATCH 3/9] Merge in audio generation only on text stream finish, postpone audio block autoplay

- Keeping simpleaudio until the audio block's "autoplay" no longer plays previous messages
- Only generate audio for finished messages
- Better name for autoplay, clean up comments
- Set default to unlimited wav files. Still a few bugs when the wav id resets

Co-Authored-By: Christoph Hess <9931495+ChristophHess@users.noreply.github.com>
---
 extensions/silero_tts/script.py | 34 +++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py
index eaf56159..334b02b9 100644
--- a/extensions/silero_tts/script.py
+++ b/extensions/silero_tts/script.py
@@ -15,14 +15,15 @@ params = {
     'model_id': 'v3_en',
     'sample_rate': 48000,
     'device': 'cpu',
-    'max_wavs': 20,
-    'play_audio': True,
+    'max_wavs': -1,
+    'autoplay': True,
     'show_text': True,
 }
 current_params = params.copy()
 voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115']
 wav_idx = 0
 
+#Used for making text xml compatible, needed for voice pitch and speed control
 table = str.maketrans({
     "<": "&lt;",
     ">": "&gt;",
@@ -88,27 +89,32 @@ def output_modifier(string):
 
     #x-slow, slow, medium, fast, x-fast
     #x-low, low, medium, high, x-high
-    #prosody='<prosody rate="x-slow" pitch="x-high">'
-    prosody='<prosody rate="medium" pitch="medium">'
+    prosody='<prosody rate="medium" pitch="medium">'
     string ='<speak>'+prosody+xmlesc(string)+'</prosody></speak>'
 
     output_file = Path(f'extensions/silero_tts/outputs/{wav_idx:06d}.wav')
-    model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
-    string = f'<audio src="file/{output_file.as_posix()}" controls></audio>'
+    autoplay_str = ''
+    if not shared.still_streaming:
+        model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
+        #diabled until autoplay doesn't run on previous messages
+        #autoplay = 'autoplay' if (params['autoplay'] and auto_playable) else ''
+        string = f'<audio src="file/{output_file.as_posix()}" controls {autoplay_str}></audio>\n\n'
+    else:
+        #placeholder so text doesnt shift around so much
+        string =f'<audio controls></audio>\n\n'
 
     #reset if too many wavs. set max to -1 for unlimited.
     if wav_idx < params['max_wavs'] or params['max_wavs'] < 0:
-        #only increment if starting a new stream, else replace during streaming. Does not update duration on webui sometimes?
+        #only increment if starting a new stream, else replace during streaming.
if not shared.still_streaming: wav_idx += 1 else: wav_idx = 0 - + if params['show_text']: - string+='\n\n'+orig_string - - #if params['play_audio'] == True and auto_playable and shared.stop_everything: - if params['play_audio'] == True and auto_playable and not shared.still_streaming: + string+=orig_string + + if params['autoplay'] == True and auto_playable and not shared.still_streaming: stop_autoplay() wave_obj = sa.WaveObject.from_wave_file(output_file.as_posix()) wave_obj.play() @@ -131,13 +137,13 @@ def ui(): # Gradio elements activate = gr.Checkbox(value=params['activate'], label='Activate TTS') show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player') - play_audio = gr.Checkbox(value=params['play_audio'], label='Play TTS automatically') + autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically') stop_audio = gr.Button("Stop Auto-Play") voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice') # Event functions to update the parameters in the backend activate.change(lambda x: params.update({"activate": x}), activate, None) - play_audio.change(lambda x: params.update({"play_audio": x}), play_audio, None) + autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None) show_text.change(lambda x: params.update({"show_text": x}), show_text, None) stop_audio.click(stop_autoplay) voice.change(lambda x: params.update({"speaker": x}), voice, None) From 0dfac4b777009d415d848c2f0bc718ec1bbac7e5 Mon Sep 17 00:00:00 2001 From: Xan <70198941+xanthousm@users.noreply.github.com> Date: Sat, 11 Mar 2023 16:34:59 +1100 Subject: [PATCH 4/9] Working html autoplay, clean up, improve wav naming - New autoplay using html tag, removed from old message when new input provided - Add voice pitch and speed control - Group settings together - Use name + conversation history to match wavs to messages, minimize problems when changing characters Current minor bugs: - Gradio seems to cache the audio files, so using "clear history" and generating new messages will play the old audio (the new messages are saving correctly). Gradio will clear cache and use correct audio after a few messages or after a page refresh. - Switching characters does not immediately update the message ID used for the audio. 
ID is updated after the first new message, but that message will use the wrong ID.
---
 extensions/silero_tts/requirements.txt |  1 -
 extensions/silero_tts/script.py        | 79 +++++++++++++------------
 2 files changed, 38 insertions(+), 42 deletions(-)

diff --git a/extensions/silero_tts/requirements.txt b/extensions/silero_tts/requirements.txt
index b4444306..f2f0bff5 100644
--- a/extensions/silero_tts/requirements.txt
+++ b/extensions/silero_tts/requirements.txt
@@ -4,4 +4,3 @@ pydub
 PyYAML
 torch
 torchaudio
-simpleaudio
diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py
index 334b02b9..b66963e2 100644
--- a/extensions/silero_tts/script.py
+++ b/extensions/silero_tts/script.py
@@ -4,7 +4,6 @@ import gradio as gr
 import torch
 
 import modules.shared as shared
-import simpleaudio as sa
 
 torch._C._jit_set_profiling_mode(False)
 
@@ -15,13 +14,16 @@ params = {
     'model_id': 'v3_en',
     'sample_rate': 48000,
     'device': 'cpu',
-    'max_wavs': -1,
-    'autoplay': True,
     'show_text': True,
+    'autoplay': True,
+    'voice_pitch': 'medium',
+    'voice_speed': 'medium',
 }
 current_params = params.copy()
 voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115']
-wav_idx = 0
+voice_pitches = ['x-low', 'low', 'medium', 'high', 'x-high']
+voice_speeds = ['x-slow', 'slow', 'medium', 'fast', 'x-fast']
+last_msg_id = 0
 
 #Used for making text xml compatible, needed for voice pitch and speed control
 table = str.maketrans({
@@ -55,6 +57,14 @@ def input_modifier(string):
     This function is applied to your text inputs before they are fed into the model.
     """
+    #remove autoplay from previous
+    if len(shared.history['internal'])>0:
+        [text, reply] = shared.history['internal'][-1]
+        [visible_text, visible_reply] = shared.history['visible'][-1]
+        rep_clean = reply.replace('controls autoplay>','controls>')
+        vis_rep_clean = visible_reply.replace('controls autoplay>','controls>')
+        shared.history['internal'][-1] = [text, rep_clean]
+        shared.history['visible'][-1] = [visible_text, vis_rep_clean]
 
     return string
 
@@ -63,7 +73,7 @@ def output_modifier(string):
     This function is applied to the model outputs.
     """
 
-    global wav_idx, model, current_params
+    global model, current_params
 
     for i in params:
         if params[i] != current_params[i]:
@@ -81,44 +91,31 @@ def output_modifier(string):
     string = string.replace('\n', ' ')
     string = string.strip()
 
-    auto_playable=True
+    silent_string = False #Used to prevent unnecessary audio file generation
     if string == '':
         string = 'empty reply, try regenerating'
-        auto_playable=False
-
+        silent_string = True
 
     #x-slow, slow, medium, fast, x-fast
     #x-low, low, medium, high, x-high
-    prosody='<prosody rate="medium" pitch="medium">'
+    pitch = params['voice_pitch']
+    speed = params['voice_speed']
+    prosody=f'<prosody rate="{speed}" pitch="{pitch}">'
     string ='<speak>'+prosody+xmlesc(string)+'</prosody></speak>'
-
-    output_file = Path(f'extensions/silero_tts/outputs/{wav_idx:06d}.wav')
-    autoplay_str = ''
-    if not shared.still_streaming:
+
+    current_msg_id=len(shared.history['visible'])#check length here, since output_modifier can run many times on the same message
+    output_file = Path(f'extensions/silero_tts/outputs/{shared.character}_{current_msg_id:06d}.wav')
+    if not shared.still_streaming and not silent_string:
         model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
-        #diabled until autoplay doesn't run on previous messages
-        #autoplay = 'autoplay' if (params['autoplay'] and auto_playable) else ''
-        string = f'<audio src="file/{output_file.as_posix()}" controls {autoplay_str}></audio>\n\n'
+        string = f'<audio src="file/{output_file.as_posix()}" controls autoplay></audio>\n\n'
     else:
-        #placeholder so text doesnt shift around so much
-        string =f'<audio controls></audio>\n\n'
+        #placeholder so text doesn't shift around so much
+        string ='<audio controls></audio>\n\n'
 
-    #reset if too many wavs. set max to -1 for unlimited.
-    if wav_idx < params['max_wavs'] or params['max_wavs'] < 0:
-        #only increment if starting a new stream, else replace during streaming.
-        if not shared.still_streaming:
-            wav_idx += 1
-    else:
-        wav_idx = 0
-
+
     if params['show_text']:
+        #string+=f'*[{current_msg_id}]:*'+orig_string #Debug, looks like there is a delay in "current_msg_id" being updated when switching characters (updates after new message sent). Can't find the source. "shared.character" is updating properly.
string+=orig_string - if params['autoplay'] == True and auto_playable and not shared.still_streaming: - stop_autoplay() - wave_obj = sa.WaveObject.from_wave_file(output_file.as_posix()) - wave_obj.play() - return string def bot_prefix_modifier(string): @@ -130,20 +127,20 @@ def bot_prefix_modifier(string): return string -def stop_autoplay(): - sa.stop_all() - def ui(): # Gradio elements - activate = gr.Checkbox(value=params['activate'], label='Activate TTS') - show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player') - autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically') - stop_audio = gr.Button("Stop Auto-Play") - voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice') + with gr.Accordion("Silero TTS"): + activate = gr.Checkbox(value=params['activate'], label='Activate TTS') + show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player') + autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically') + voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice') + v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch') + v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed') # Event functions to update the parameters in the backend activate.change(lambda x: params.update({"activate": x}), activate, None) - autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None) show_text.change(lambda x: params.update({"show_text": x}), show_text, None) - stop_audio.click(stop_autoplay) + autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None) voice.change(lambda x: params.update({"speaker": x}), voice, None) + v_pitch.change(lambda x: params.update({"voice_pitch": x}), v_pitch, None) + v_speed.change(lambda x: params.update({"voice_speed": x}), v_speed, None) From b8f7d34c1df5b12e60491e4c8a6494d5e6aec20e Mon Sep 17 00:00:00 2001 From: Xan <70198941+xanthousm@users.noreply.github.com> Date: Sat, 11 Mar 2023 17:05:09 +1100 Subject: [PATCH 5/9] Undo changes to requirements needing to manually install tensorboard might be a windows-only problem. Can be easily solved manually. 
---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index a8a6eada..47c56a45 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,5 @@ gradio==3.18.0
 numpy
 rwkv==0.1.0
 safetensors==0.2.8
-tensorboard
 sentencepiece
 git+https://github.com/oobabooga/transformers@llama_push

From 8f8da6707d7e71c2eef01c2d33ca6623cebf080c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 11 Mar 2023 11:17:13 -0300
Subject: [PATCH 6/9] Minor style changes to silero_tts

---
 extensions/silero_tts/script.py | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py
index b66963e2..7e63d8b7 100644
--- a/extensions/silero_tts/script.py
+++ b/extensions/silero_tts/script.py
@@ -14,18 +14,19 @@ params = {
     'model_id': 'v3_en',
     'sample_rate': 48000,
     'device': 'cpu',
-    'show_text': True,
+    'show_text': False,
     'autoplay': True,
     'voice_pitch': 'medium',
     'voice_speed': 'medium',
 }
+
 current_params = params.copy()
 voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115']
 voice_pitches = ['x-low', 'low', 'medium', 'high', 'x-high']
 voice_speeds = ['x-slow', 'slow', 'medium', 'fast', 'x-fast']
 last_msg_id = 0
 
-#Used for making text xml compatible, needed for voice pitch and speed control
+# Used for making text xml compatible, needed for voice pitch and speed control
 table = str.maketrans({
     "<": "&lt;",
     ">": "&gt;",
@@ -33,6 +34,7 @@ table = str.maketrans({
     "'": "&apos;",
     '"': "&quot;",
 })
+
 def xmlesc(txt):
     return txt.translate(table)
 
@@ -57,7 +59,8 @@ def input_modifier(string):
     This function is applied to your text inputs before they are fed into the model.
     """
-    #remove autoplay from previous
+
+    # Remove autoplay from previous
     if len(shared.history['internal'])>0:
         [text, reply] = shared.history['internal'][-1]
         [visible_text, visible_reply] = shared.history['visible'][-1]
@@ -91,30 +94,30 @@ def output_modifier(string):
     string = string.replace('\n', ' ')
     string = string.strip()
 
-    silent_string = False #Used to prevent unnecessary audio file generation
+    silent_string = False # Used to prevent unnecessary audio file generation
     if string == '':
-        string = 'empty reply, try regenerating'
-        silent_string = True
+        string = 'empty reply, try regenerating'
+        silent_string = True
 
-    #x-slow, slow, medium, fast, x-fast
-    #x-low, low, medium, high, x-high
+    # x-slow, slow, medium, fast, x-fast
+    # x-low, low, medium, high, x-high
     pitch = params['voice_pitch']
     speed = params['voice_speed']
     prosody=f'<prosody rate="{speed}" pitch="{pitch}">'
-    string ='<speak>'+prosody+xmlesc(string)+'</prosody></speak>'
+    string = '<speak>'+prosody+xmlesc(string)+'</prosody></speak>'
 
-    current_msg_id=len(shared.history['visible'])#check length here, since output_modifier can run many times on the same message
+    current_msg_id = len(shared.history['visible']) # Check length here, since output_modifier can run many times on the same message
     output_file = Path(f'extensions/silero_tts/outputs/{shared.character}_{current_msg_id:06d}.wav')
     if not shared.still_streaming and not silent_string:
         model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
         string = f'<audio src="file/{output_file.as_posix()}" controls autoplay></audio>\n\n'
     else:
-        #placeholder so text doesn't shift around so much
-        string ='<audio controls></audio>\n\n'
+        # Placeholder so text doesn't shift around so much
+        string = '<audio controls></audio>\n\n'
 
     if params['show_text']:
-        #string+=f'*[{current_msg_id}]:*'+orig_string #Debug, looks like there is a delay in "current_msg_id" being updated when switching characters (updates after new message sent). Can't find the source. "shared.character" is updating properly.
-        string+=orig_string
+        #string += f'*[{current_msg_id}]:*'+orig_string #Debug, looks like there is a delay in "current_msg_id" being updated when switching characters (updates after new message sent). Can't find the source. "shared.character" is updating properly.
+        string += orig_string
 
     return string
 

From d4afed4e44a748c22d9fa97edb3f818ae8af191f Mon Sep 17 00:00:00 2001
From: Xan <70198941+xanthousm@users.noreply.github.com>
Date: Sun, 12 Mar 2023 17:56:57 +1100
Subject: [PATCH 7/9] Fixes and polish

- Change wav naming to be completely unique by using a timestamp instead of the message ID; this stops the browser from playing cached audio when new audio is made with the same file name (e.g. after regenerate or clear history).
- Make the autoplay setting actually disable autoplay.
- Make the settings panel a bit more compact.
- Hide html errors when the audio file for a chat history entry is missing.
- Add a button to permanently convert TTS history to normal text messages.
- Changed the "show message text" toggle to affect the chat history.
---
 extensions/silero_tts/script.py | 89 ++++++++++++++++++++++++++-------
 1 file changed, 72 insertions(+), 17 deletions(-)

diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py
index 7e63d8b7..1a60c901 100644
--- a/extensions/silero_tts/script.py
+++ b/extensions/silero_tts/script.py
@@ -2,8 +2,10 @@ from pathlib import Path
 
 import gradio as gr
 import torch
-
+import time
+import re
 import modules.shared as shared
+import modules.chat as chat
 
 torch._C._jit_set_profiling_mode(False)
 
@@ -54,19 +56,57 @@ def remove_surrounded_chars(string):
         new_string += char
     return new_string
 
+def remove_tts_from_history():
+    suffix = '_pygmalion' if 'pygmalion' in shared.model_name.lower() else ''
+    for i, entry in enumerate(shared.history['internal']):
+        reply = entry[1]
+        reply = re.sub("(<USER>|<user>|{{user}})", shared.settings[f'name1{suffix}'], reply)
+        if shared.args.chat:
+            reply = reply.replace('\n', '<br>')
+        shared.history['visible'][i][1] = reply
+
+    if shared.args.cai_chat:
+        return chat.generate_chat_html(shared.history['visible'], shared.settings[f'name1{suffix}'], shared.settings[f'name1{suffix}'], shared.character)
+    else:
+        return shared.history['visible']
+
+def toggle_text_in_history():
+    suffix = '_pygmalion' if 'pygmalion' in shared.model_name.lower() else ''
+    audio_str='\n\n' # The '\n\n' used after </audio>
+    if shared.args.chat:
+        audio_str='<br><br>'
+
+    if params['show_text']==True:
+        #for i, entry in enumerate(shared.history['internal']):
+        for i, entry in enumerate(shared.history['visible']):
+            vis_reply = entry[1]
+            if vis_reply.startswith('<audio'):
+                reply = shared.history['internal'][i][1]
+                reply = re.sub("(<USER>|<user>|{{user}})", shared.settings[f'name1{suffix}'], reply)
+                if shared.args.chat:
+                    reply = reply.replace('\n', '<br>')
+                shared.history['visible'][i][1] = vis_reply.split(audio_str,1)[0]+audio_str+reply
+    else:
+        for i, entry in enumerate(shared.history['visible']):
+            vis_reply = entry[1]
+            if vis_reply.startswith('<audio'):
+                shared.history['visible'][i][1] = vis_reply.split(audio_str,1)[0]+audio_str
+
+    if shared.args.cai_chat:
+        return chat.generate_chat_html(shared.history['visible'], shared.settings[f'name1{suffix}'], shared.settings[f'name1{suffix}'], shared.character)
+    else:
+        return shared.history['visible']
+
 def load_model():
@@ -86,14 +126,11 @@ def input_modifier(string):
     This function is applied to your text inputs before they are fed into the model.
     """
 
-    # Remove autoplay from previous
-    if len(shared.history['internal'])>0:
-        [text, reply] = shared.history['internal'][-1]
+    # Remove autoplay from previous chat history
+    if (shared.args.chat or shared.args.cai_chat)and len(shared.history['internal'])>0:
         [visible_text, visible_reply] = shared.history['visible'][-1]
-        rep_clean = reply.replace('controls autoplay>','controls>')
         vis_rep_clean = visible_reply.replace('controls autoplay>','controls>')
-        shared.history['internal'][-1] = [text, rep_clean]
         shared.history['visible'][-1] = [visible_text, vis_rep_clean]
 
     return string
 
@@ -99,24 +139,21 @@ def output_modifier(string):
         string = 'empty reply, try regenerating'
         silent_string = True
 
-    # x-slow, slow, medium, fast, x-fast
-    # x-low, low, medium, high, x-high
     pitch = params['voice_pitch']
     speed = params['voice_speed']
     prosody=f'<prosody rate="{speed}" pitch="{pitch}">'
     string = '<speak>'+prosody+xmlesc(string)+'</prosody></speak>'
 
-    current_msg_id = len(shared.history['visible']) # Check length here, since output_modifier can run many times on the same message
-    output_file = Path(f'extensions/silero_tts/outputs/{shared.character}_{current_msg_id:06d}.wav')
     if not shared.still_streaming and not silent_string:
+        output_file = Path(f'extensions/silero_tts/outputs/{shared.character}_{int(time.time())}.wav')
         model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
-        string = f'<audio src="file/{output_file.as_posix()}" controls autoplay></audio>\n\n'
+        autoplay_str = ' autoplay' if params['autoplay'] else ''
+        string = f'<audio src="file/{output_file.as_posix()}" controls{autoplay_str}></audio>\n\n'
     else:
         # Placeholder so text doesn't shift around so much
         string = '<audio controls></audio>\n\n'
 
     if params['show_text']:
-        #string += f'*[{current_msg_id}]:*'+orig_string #Debug, looks like there is a delay in "current_msg_id" being updated when switching characters (updates after new message sent). Can't find the source. "shared.character" is updating properly.
string += orig_string return string @@ -133,16 +170,34 @@ def bot_prefix_modifier(string): def ui(): # Gradio elements with gr.Accordion("Silero TTS"): - activate = gr.Checkbox(value=params['activate'], label='Activate TTS') + with gr.Row(): + activate = gr.Checkbox(value=params['activate'], label='Activate TTS') + autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically') show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player') - autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically') voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice') - v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch') - v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed') + with gr.Row(): + v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch') + v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed') + with gr.Row(): + convert = gr.Button('Permanently replace chat history audio with message text') + convert_confirm = gr.Button('Confirm (cannot be undone)', variant="stop", visible=False) + convert_cancel = gr.Button('Cancel', visible=False) + + # Convert history with confirmation + convert_arr = [convert_confirm, convert, convert_cancel] + convert.click(lambda :[gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr) + convert_confirm.click(lambda :[gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr) + convert_confirm.click(remove_tts_from_history, [], shared.gradio['display']) + convert_confirm.click(lambda : chat.save_history(timestamp=False), [], [], show_progress=False) + convert_cancel.click(lambda :[gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr) + + # Toggle message text in history + show_text.change(lambda x: params.update({"show_text": x}), show_text, None) + show_text.change(toggle_text_in_history, [], shared.gradio['display']) + show_text.change(lambda : chat.save_history(timestamp=False), [], [], show_progress=False) # Event functions to update the parameters in the backend activate.change(lambda x: params.update({"activate": x}), activate, None) - show_text.change(lambda x: params.update({"show_text": x}), show_text, None) autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None) voice.change(lambda x: params.update({"speaker": x}), voice, None) v_pitch.change(lambda x: params.update({"voice_pitch": x}), v_pitch, None) From 9276af3561df4d6b25cadc85dd9e51fe167fe807 Mon Sep 17 00:00:00 2001 From: Xan <70198941+xanthousm@users.noreply.github.com> Date: Sun, 12 Mar 2023 19:06:24 +1100 Subject: [PATCH 8/9] clean up --- .idea/workspace.xml | 64 --------------------------------------------- 1 file changed, 64 deletions(-) delete mode 100644 .idea/workspace.xml diff --git a/.idea/workspace.xml b/.idea/workspace.xml deleted file mode 100644 index 404920a8..00000000 --- a/.idea/workspace.xml +++ /dev/null @@ -1,64 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 1678590722207 - - - - \ No newline at end of file From 4066ab4c0ca608bc4f95f50fe7c7f11334192946 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 13:36:18 -0300 Subject: [PATCH 9/9] Reorder the imports --- 
extensions/silero_tts/script.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py index 1a60c901..62d4b441 100644 --- a/extensions/silero_tts/script.py +++ b/extensions/silero_tts/script.py @@ -1,11 +1,12 @@ +import re +import time from pathlib import Path import gradio as gr import torch -import time -import re -import modules.shared as shared + import modules.chat as chat +import modules.shared as shared torch._C._jit_set_profiling_mode(False)