Minor style changes to silero_tts

This commit is contained in:
oobabooga 2023-03-11 11:17:13 -03:00
parent 33df4bd91f
commit 8f8da6707d

View File

@ -14,18 +14,19 @@ params = {
'model_id': 'v3_en', 'model_id': 'v3_en',
'sample_rate': 48000, 'sample_rate': 48000,
'device': 'cpu', 'device': 'cpu',
'show_text': True, 'show_text': False,
'autoplay': True, 'autoplay': True,
'voice_pitch': 'medium', 'voice_pitch': 'medium',
'voice_speed': 'medium', 'voice_speed': 'medium',
} }
current_params = params.copy() current_params = params.copy()
voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115'] voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115']
voice_pitches = ['x-low', 'low', 'medium', 'high', 'x-high'] voice_pitches = ['x-low', 'low', 'medium', 'high', 'x-high']
voice_speeds = ['x-slow', 'slow', 'medium', 'fast', 'x-fast'] voice_speeds = ['x-slow', 'slow', 'medium', 'fast', 'x-fast']
last_msg_id = 0 last_msg_id = 0
#Used for making text xml compatible, needed for voice pitch and speed control # Used for making text xml compatible, needed for voice pitch and speed control
table = str.maketrans({ table = str.maketrans({
"<": "&lt;", "<": "&lt;",
">": "&gt;", ">": "&gt;",
@ -33,6 +34,7 @@ table = str.maketrans({
"'": "&apos;", "'": "&apos;",
'"': "&quot;", '"': "&quot;",
}) })
def xmlesc(txt): def xmlesc(txt):
return txt.translate(table) return txt.translate(table)
@ -57,7 +59,8 @@ def input_modifier(string):
This function is applied to your text inputs before This function is applied to your text inputs before
they are fed into the model. they are fed into the model.
""" """
#remove autoplay from previous
# Remove autoplay from previous
if len(shared.history['internal'])>0: if len(shared.history['internal'])>0:
[text, reply] = shared.history['internal'][-1] [text, reply] = shared.history['internal'][-1]
[visible_text, visible_reply] = shared.history['visible'][-1] [visible_text, visible_reply] = shared.history['visible'][-1]
@ -91,30 +94,30 @@ def output_modifier(string):
string = string.replace('\n', ' ') string = string.replace('\n', ' ')
string = string.strip() string = string.strip()
silent_string = False #Used to prevent unnecessary audio file generation silent_string = False # Used to prevent unnecessary audio file generation
if string == '': if string == '':
string = 'empty reply, try regenerating' string = 'empty reply, try regenerating'
silent_string = True silent_string = True
#x-slow, slow, medium, fast, x-fast # x-slow, slow, medium, fast, x-fast
#x-low, low, medium, high, x-high # x-low, low, medium, high, x-high
pitch = params['voice_pitch'] pitch = params['voice_pitch']
speed = params['voice_speed'] speed = params['voice_speed']
prosody=f'<prosody rate="{speed}" pitch="{pitch}">' prosody=f'<prosody rate="{speed}" pitch="{pitch}">'
string ='<speak>'+prosody+xmlesc(string)+'</prosody></speak>' string = '<speak>'+prosody+xmlesc(string)+'</prosody></speak>'
current_msg_id=len(shared.history['visible'])#check length here, since output_modifier can run many times on the same message current_msg_id = len(shared.history['visible']) # Check length here, since output_modifier can run many times on the same message
output_file = Path(f'extensions/silero_tts/outputs/{shared.character}_{current_msg_id:06d}.wav') output_file = Path(f'extensions/silero_tts/outputs/{shared.character}_{current_msg_id:06d}.wav')
if not shared.still_streaming and not silent_string: if not shared.still_streaming and not silent_string:
model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file)) model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
string = f'<audio id="audio_{current_msg_id:06d}" src="file/{output_file.as_posix()}" controls autoplay></audio>\n\n' string = f'<audio id="audio_{current_msg_id:06d}" src="file/{output_file.as_posix()}" controls autoplay></audio>\n\n'
else: else:
#placeholder so text doesn't shift around so much # Placeholder so text doesn't shift around so much
string ='<audio controls></audio>\n\n' string = '<audio controls></audio>\n\n'
if params['show_text']: if params['show_text']:
#string+=f'*[{current_msg_id}]:*'+orig_string #Debug, looks like there is a delay in "current_msg_id" being updated when switching characters (updates after new message sent). Can't find the source. "shared.character" is updating properly. #string += f'*[{current_msg_id}]:*'+orig_string #Debug, looks like there is a delay in "current_msg_id" being updated when switching characters (updates after new message sent). Can't find the source. "shared.character" is updating properly.
string+=orig_string string += orig_string
return string return string