mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-22 16:17:57 +01:00
1a1e420e65
Temporarily suppress the streaming during the audio response as it would interfere with the audio (making it stutter and play anew)
167 lines
8.1 KiB
Python
167 lines
8.1 KiB
Python
import re
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import gradio as gr
|
|
import modules.chat as chat
|
|
import modules.shared as shared
|
|
import torch
|
|
|
|
torch._C._jit_set_profiling_mode(False)
|
|
|
|
params = {
|
|
'activate': True,
|
|
'speaker': 'en_56',
|
|
'language': 'en',
|
|
'model_id': 'v3_en',
|
|
'sample_rate': 48000,
|
|
'device': 'cpu',
|
|
'show_text': False,
|
|
'autoplay': True,
|
|
'voice_pitch': 'medium',
|
|
'voice_speed': 'medium',
|
|
}
|
|
|
|
current_params = params.copy()
|
|
voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115']
|
|
voice_pitches = ['x-low', 'low', 'medium', 'high', 'x-high']
|
|
voice_speeds = ['x-slow', 'slow', 'medium', 'fast', 'x-fast']
|
|
streaming_state = shared.args.no_stream # remember if chat streaming was enabled
|
|
|
|
# Used for making text xml compatible, needed for voice pitch and speed control
|
|
table = str.maketrans({
|
|
"<": "<",
|
|
">": ">",
|
|
"&": "&",
|
|
"'": "'",
|
|
'"': """,
|
|
})
|
|
|
|
def xmlesc(txt):
|
|
return txt.translate(table)
|
|
|
|
def load_model():
|
|
model, example_text = torch.hub.load(repo_or_dir='snakers4/silero-models', model='silero_tts', language=params['language'], speaker=params['model_id'])
|
|
model.to(params['device'])
|
|
return model
|
|
model = load_model()
|
|
|
|
def remove_surrounded_chars(string):
|
|
# this expression matches to 'as few symbols as possible (0 upwards) between any asterisks' OR
|
|
# 'as few symbols as possible (0 upwards) between an asterisk and the end of the string'
|
|
return re.sub('\*[^\*]*?(\*|$)','',string)
|
|
|
|
def remove_tts_from_history(name1, name2):
|
|
for i, entry in enumerate(shared.history['internal']):
|
|
shared.history['visible'][i] = [shared.history['visible'][i][0], entry[1]]
|
|
return chat.generate_chat_output(shared.history['visible'], name1, name2, shared.character)
|
|
|
|
def toggle_text_in_history(name1, name2):
|
|
for i, entry in enumerate(shared.history['visible']):
|
|
visible_reply = entry[1]
|
|
if visible_reply.startswith('<audio'):
|
|
if params['show_text']:
|
|
reply = shared.history['internal'][i][1]
|
|
shared.history['visible'][i] = [shared.history['visible'][i][0], f"{visible_reply.split('</audio>')[0]}</audio>\n\n{reply}"]
|
|
else:
|
|
shared.history['visible'][i] = [shared.history['visible'][i][0], f"{visible_reply.split('</audio>')[0]}</audio>"]
|
|
return chat.generate_chat_output(shared.history['visible'], name1, name2, shared.character)
|
|
|
|
def input_modifier(string):
|
|
"""
|
|
This function is applied to your text inputs before
|
|
they are fed into the model.
|
|
"""
|
|
|
|
# Remove autoplay from the last reply
|
|
if (shared.args.chat or shared.args.cai_chat) and len(shared.history['internal']) > 0:
|
|
shared.history['visible'][-1] = [shared.history['visible'][-1][0], shared.history['visible'][-1][1].replace('controls autoplay>','controls>')]
|
|
|
|
shared.processing_message = "*Is recording a voice message...*"
|
|
shared.args.no_stream = True # Disable streaming cause otherwise the audio output will stutter and begin anew every time the message is being updated
|
|
return string
|
|
|
|
def output_modifier(string):
|
|
"""
|
|
This function is applied to the model outputs.
|
|
"""
|
|
|
|
global model, current_params, streaming_state
|
|
|
|
for i in params:
|
|
if params[i] != current_params[i]:
|
|
model = load_model()
|
|
current_params = params.copy()
|
|
break
|
|
|
|
if params['activate'] == False:
|
|
return string
|
|
|
|
original_string = string
|
|
string = remove_surrounded_chars(string)
|
|
string = string.replace('"', '')
|
|
string = string.replace('“', '')
|
|
string = string.replace('\n', ' ')
|
|
string = string.strip()
|
|
|
|
if string == '':
|
|
string = '*Empty reply, try regenerating*'
|
|
else:
|
|
output_file = Path(f'extensions/silero_tts/outputs/{shared.character}_{int(time.time())}.wav')
|
|
prosody = '<prosody rate="{}" pitch="{}">'.format(params['voice_speed'], params['voice_pitch'])
|
|
silero_input = f'<speak>{prosody}{xmlesc(string)}</prosody></speak>'
|
|
model.save_wav(ssml_text=silero_input, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
|
|
|
|
autoplay = 'autoplay' if params['autoplay'] else ''
|
|
string = f'<audio src="file/{output_file.as_posix()}" controls {autoplay}></audio>'
|
|
if params['show_text']:
|
|
string += f'\n\n{original_string}'
|
|
|
|
shared.processing_message = "*Is typing...*"
|
|
shared.args.no_stream = streaming_state # restore the streaming option to the previous value
|
|
return string
|
|
|
|
def bot_prefix_modifier(string):
|
|
"""
|
|
This function is only applied in chat mode. It modifies
|
|
the prefix text for the Bot and can be used to bias its
|
|
behavior.
|
|
"""
|
|
|
|
return string
|
|
|
|
def ui():
|
|
# Gradio elements
|
|
with gr.Accordion("Silero TTS"):
|
|
with gr.Row():
|
|
activate = gr.Checkbox(value=params['activate'], label='Activate TTS')
|
|
autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically')
|
|
show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player')
|
|
voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice')
|
|
with gr.Row():
|
|
v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch')
|
|
v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed')
|
|
with gr.Row():
|
|
convert = gr.Button('Permanently replace audios with the message texts')
|
|
convert_cancel = gr.Button('Cancel', visible=False)
|
|
convert_confirm = gr.Button('Confirm (cannot be undone)', variant="stop", visible=False)
|
|
|
|
# Convert history with confirmation
|
|
convert_arr = [convert_confirm, convert, convert_cancel]
|
|
convert.click(lambda :[gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr)
|
|
convert_confirm.click(lambda :[gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr)
|
|
convert_confirm.click(remove_tts_from_history, [shared.gradio['name1'], shared.gradio['name2']], shared.gradio['display'])
|
|
convert_confirm.click(lambda : chat.save_history(timestamp=False), [], [], show_progress=False)
|
|
convert_cancel.click(lambda :[gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr)
|
|
|
|
# Toggle message text in history
|
|
show_text.change(lambda x: params.update({"show_text": x}), show_text, None)
|
|
show_text.change(toggle_text_in_history, [shared.gradio['name1'], shared.gradio['name2']], shared.gradio['display'])
|
|
show_text.change(lambda : chat.save_history(timestamp=False), [], [], show_progress=False)
|
|
|
|
# Event functions to update the parameters in the backend
|
|
activate.change(lambda x: params.update({"activate": x}), activate, None)
|
|
autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None)
|
|
voice.change(lambda x: params.update({"speaker": x}), voice, None)
|
|
v_pitch.change(lambda x: params.update({"voice_pitch": x}), v_pitch, None)
|
|
v_speed.change(lambda x: params.update({"voice_speed": x}), v_speed, None) |