From acf24ebb49d0077796508ab05630d8d5ae3df14d Mon Sep 17 00:00:00 2001 From: Brandon McClure Date: Fri, 7 Jul 2023 17:54:53 -0600 Subject: [PATCH] Whisper_stt params for model, language, and auto_submit (#3031) --- extensions/whisper_stt/readme.md | 15 +++++++++++++++ extensions/whisper_stt/script.py | 32 +++++++++++++++++++++++--------- 2 files changed, 38 insertions(+), 9 deletions(-) create mode 100644 extensions/whisper_stt/readme.md diff --git a/extensions/whisper_stt/readme.md b/extensions/whisper_stt/readme.md new file mode 100644 index 00000000..cd9abbf6 --- /dev/null +++ b/extensions/whisper_stt/readme.md @@ -0,0 +1,15 @@ +# whisper_stt + +Allows you to enter your inputs in chat mode using your microphone. + +## Settings + +To adjust your default settings, you can add the following to your settings.yaml file. + +``` +whisper_stt-whipser_language: chinese +whisper_stt-whipser_model: tiny +whisper_stt-auto_submit: False +``` + +See source documentation for [model names](https://github.com/openai/whisper#available-models-and-languages) and [languages](https://github.com/openai/whisper/blob/main/whisper/tokenizer.py) you can use. 
\ No newline at end of file diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py index 32226404..44a9ac81 100644 --- a/extensions/whisper_stt/script.py +++ b/extensions/whisper_stt/script.py @@ -8,8 +8,15 @@ input_hijack = { 'value': ["", ""] } +# parameters which can be customized in settings.yaml of webui +params = { + 'whipser_language': 'english', + 'whipser_model': 'small.en', + 'auto_submit': True +} -def do_stt(audio): + +def do_stt(audio,whipser_model,whipser_language): transcription = "" r = sr.Recognizer() @@ -17,7 +24,7 @@ def do_stt(audio): audio_data = sr.AudioData(sample_rate=audio[0], frame_data=audio[1], sample_width=4) try: - transcription = r.recognize_whisper(audio_data, language="english", model="base.en") + transcription = r.recognize_whisper(audio_data, language=whipser_language, model=whipser_model) except sr.UnknownValueError: print("Whisper could not understand audio") except sr.RequestError as e: @@ -26,11 +33,10 @@ def do_stt(audio): return transcription -def auto_transcribe(audio, auto_submit): +def auto_transcribe(audio, auto_submit,whipser_model,whipser_language): if audio is None: return "", "" - - transcription = do_stt(audio) + transcription = do_stt(audio,whipser_model,whipser_language) if auto_submit: input_hijack.update({"state": True, "value": [transcription, transcription]}) @@ -38,10 +44,18 @@ def auto_transcribe(audio, auto_submit): def ui(): - with gr.Row(): - audio = gr.Audio(source="microphone") - auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=True) + with gr.Accordion("Whisper STT", open=True): + with gr.Row(): + audio = gr.Audio(source="microphone") + with gr.Row(): + with gr.Accordion("Settings", open=False): + auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=params['auto_submit']) + whipser_model = gr.Dropdown(label='Whisper Model', value=params['whipser_model'],choices=["tiny.en","base.en", 
"small.en","medium.en","tiny","base","small","medium","large"]) + whipser_language = gr.Dropdown(label='Whisper Language', value=params['whipser_language'],choices=["chinese","german","spanish","russian","korean","french","japanese","portuguese","turkish","polish","catalan","dutch","arabic","swedish","italian","indonesian","hindi","finnish","vietnamese","hebrew","ukrainian","greek","malay","czech","romanian","danish","hungarian","tamil","norwegian","thai","urdu","croatian","bulgarian","lithuanian","latin","maori","malayalam","welsh","slovak","telugu","persian","latvian","bengali","serbian","azerbaijani","slovenian","kannada","estonian","macedonian","breton","basque","icelandic","armenian","nepali","mongolian","bosnian","kazakh","albanian","swahili","galician","marathi","punjabi","sinhala","khmer","shona","yoruba","somali","afrikaans","occitan","georgian","belarusian","tajik","sindhi","gujarati","amharic","yiddish","lao","uzbek","faroese","haitian creole","pashto","turkmen","nynorsk","maltese","sanskrit","luxembourgish","myanmar","tibetan","tagalog","malagasy","assamese","tatar","hawaiian","lingala","hausa","bashkir","javanese","sundanese"]) audio.change( - auto_transcribe, [audio, auto_submit], [shared.gradio['textbox'], audio]).then( + auto_transcribe, [audio, auto_submit,whipser_model,whipser_language], [shared.gradio['textbox'], audio]).then( None, auto_submit, None, _js="(check) => {if (check) { document.getElementById('Generate').click() }}") + whipser_model.change(lambda x: params.update({"whipser_model": x}), whipser_model, None) + whipser_language.change(lambda x: params.update({"whipser_language": x}), whipser_language, None) + auto_submit.change(lambda x: params.update({"auto_submit": x}), auto_submit, None)