Working html autoplay, clean up, improve wav naming

- New autoplay using the HTML audio tag's autoplay attribute; it is removed from the previous message when new input is provided
- Add voice pitch and speed control
- Group settings together
- Use the character name + conversation history length to match wavs to messages, minimizing problems when changing characters

Current minor bugs:
- Gradio seems to cache the audio files, so using "clear history" and then generating new messages will play the old audio (the new messages are saved correctly). Gradio clears its cache and uses the correct audio after a few messages or after a page refresh.
- Switching characters does not immediately update the message ID used for the audio. The ID is updated after the first new message, but that message will use the wrong ID.
This commit is contained in:
Xan 2023-03-11 16:34:59 +11:00
parent a2b5383398
commit 0dfac4b777
2 changed files with 38 additions and 42 deletions

View File

@ -4,4 +4,3 @@ pydub
PyYAML PyYAML
torch torch
torchaudio torchaudio
simpleaudio

View File

@ -4,7 +4,6 @@ import gradio as gr
import torch import torch
import modules.shared as shared import modules.shared as shared
import simpleaudio as sa
torch._C._jit_set_profiling_mode(False) torch._C._jit_set_profiling_mode(False)
@ -15,13 +14,16 @@ params = {
'model_id': 'v3_en', 'model_id': 'v3_en',
'sample_rate': 48000, 'sample_rate': 48000,
'device': 'cpu', 'device': 'cpu',
'max_wavs': -1,
'autoplay': True,
'show_text': True, 'show_text': True,
'autoplay': True,
'voice_pitch': 'medium',
'voice_speed': 'medium',
} }
current_params = params.copy() current_params = params.copy()
voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115'] voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 
'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115']
wav_idx = 0 voice_pitches = ['x-low', 'low', 'medium', 'high', 'x-high']
voice_speeds = ['x-slow', 'slow', 'medium', 'fast', 'x-fast']
last_msg_id = 0
#Used for making text xml compatible, needed for voice pitch and speed control #Used for making text xml compatible, needed for voice pitch and speed control
table = str.maketrans({ table = str.maketrans({
@ -55,6 +57,14 @@ def input_modifier(string):
This function is applied to your text inputs before This function is applied to your text inputs before
they are fed into the model. they are fed into the model.
""" """
#remove autoplay from previous
if len(shared.history['internal'])>0:
[text, reply] = shared.history['internal'][-1]
[visible_text, visible_reply] = shared.history['visible'][-1]
rep_clean = reply.replace('controls autoplay>','controls>')
vis_rep_clean = visible_reply.replace('controls autoplay>','controls>')
shared.history['internal'][-1] = [text, rep_clean]
shared.history['visible'][-1] = [visible_text, vis_rep_clean]
return string return string
@ -63,7 +73,7 @@ def output_modifier(string):
This function is applied to the model outputs. This function is applied to the model outputs.
""" """
global wav_idx, model, current_params global model, current_params
for i in params: for i in params:
if params[i] != current_params[i]: if params[i] != current_params[i]:
@ -81,44 +91,31 @@ def output_modifier(string):
string = string.replace('\n', ' ') string = string.replace('\n', ' ')
string = string.strip() string = string.strip()
auto_playable=True silent_string = False #Used to prevent unnecessary audio file generation
if string == '': if string == '':
string = 'empty reply, try regenerating' string = 'empty reply, try regenerating'
auto_playable=False silent_string = True
#x-slow, slow, medium, fast, x-fast #x-slow, slow, medium, fast, x-fast
#x-low, low, medium, high, x-high #x-low, low, medium, high, x-high
prosody='<prosody rate="medium" pitch="medium">' pitch = params['voice_pitch']
speed = params['voice_speed']
prosody=f'<prosody rate="{speed}" pitch="{pitch}">'
string ='<speak>'+prosody+xmlesc(string)+'</prosody></speak>' string ='<speak>'+prosody+xmlesc(string)+'</prosody></speak>'
output_file = Path(f'extensions/silero_tts/outputs/{wav_idx:06d}.wav') current_msg_id=len(shared.history['visible'])#check length here, since output_modifier can run many times on the same message
autoplay_str = '' output_file = Path(f'extensions/silero_tts/outputs/{shared.character}_{current_msg_id:06d}.wav')
if not shared.still_streaming: if not shared.still_streaming and not silent_string:
model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file)) model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
#diabled until autoplay doesn't run on previous messages string = f'<audio id="audio_{current_msg_id:06d}" src="file/{output_file.as_posix()}" controls autoplay></audio>\n\n'
#autoplay = 'autoplay' if (params['autoplay'] and auto_playable) else ''
string = f'<audio src="file/{output_file.as_posix()}" controls {autoplay_str}></audio>\n\n'
else: else:
#placeholder so text doesnt shift around so much #placeholder so text doesn't shift around so much
string =f'<audio controls {autoplay_str}></audio>\n\n' string ='<audio controls></audio>\n\n'
#reset if too many wavs. set max to -1 for unlimited.
if wav_idx < params['max_wavs'] or params['max_wavs'] < 0:
#only increment if starting a new stream, else replace during streaming.
if not shared.still_streaming:
wav_idx += 1
else:
wav_idx = 0
if params['show_text']: if params['show_text']:
#string+=f'*[{current_msg_id}]:*'+orig_string #Debug, looks like there is a delay in "current_msg_id" being updated when switching characters (updates after new message sent). Can't find the source. "shared.character" is updating properly.
string+=orig_string string+=orig_string
if params['autoplay'] == True and auto_playable and not shared.still_streaming:
stop_autoplay()
wave_obj = sa.WaveObject.from_wave_file(output_file.as_posix())
wave_obj.play()
return string return string
def bot_prefix_modifier(string): def bot_prefix_modifier(string):
@ -130,20 +127,20 @@ def bot_prefix_modifier(string):
return string return string
def stop_autoplay():
sa.stop_all()
def ui(): def ui():
# Gradio elements # Gradio elements
activate = gr.Checkbox(value=params['activate'], label='Activate TTS') with gr.Accordion("Silero TTS"):
show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player') activate = gr.Checkbox(value=params['activate'], label='Activate TTS')
autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically') show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player')
stop_audio = gr.Button("Stop Auto-Play") autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically')
voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice') voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice')
v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch')
v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed')
# Event functions to update the parameters in the backend # Event functions to update the parameters in the backend
activate.change(lambda x: params.update({"activate": x}), activate, None) activate.change(lambda x: params.update({"activate": x}), activate, None)
autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None)
show_text.change(lambda x: params.update({"show_text": x}), show_text, None) show_text.change(lambda x: params.update({"show_text": x}), show_text, None)
stop_audio.click(stop_autoplay) autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None)
voice.change(lambda x: params.update({"speaker": x}), voice, None) voice.change(lambda x: params.update({"speaker": x}), voice, None)
v_pitch.change(lambda x: params.update({"voice_pitch": x}), v_pitch, None)
v_speed.change(lambda x: params.update({"voice_speed": x}), v_speed, None)