From cc825dd1f4dae499f525d7286446c38572cb78b0 Mon Sep 17 00:00:00 2001 From: mamei16 Date: Sat, 29 Jun 2024 06:32:54 +0200 Subject: [PATCH] Addressing Whisper STT issues (#5929) --- extensions/whisper_stt/script.js | 25 +++++++++++++++++++++++++ extensions/whisper_stt/script.py | 20 +++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 extensions/whisper_stt/script.js diff --git a/extensions/whisper_stt/script.js b/extensions/whisper_stt/script.js new file mode 100644 index 00000000..fff2b297 --- /dev/null +++ b/extensions/whisper_stt/script.js @@ -0,0 +1,25 @@ +var recButton = document.getElementsByClassName("record-button")[0].cloneNode(true); +var generate_button = document.getElementById("Generate"); +generate_button.insertAdjacentElement("afterend", recButton); + +recButton.style.setProperty("margin-left", "-10px"); +recButton.innerText = "Rec."; + + +recButton.addEventListener("click", function() { + var originalRecordButton = document.getElementsByClassName("record-button")[1]; + originalRecordButton.click(); + + var stopRecordButtons = document.getElementsByClassName("stop-button"); + if (stopRecordButtons.length > 1) generate_button.parentElement.removeChild(stopRecordButtons[0]); + var stopRecordButton = document.getElementsByClassName("stop-button")[0]; + generate_button.insertAdjacentElement("afterend", stopRecordButton); + + //stopRecordButton.style.setProperty("margin-left", "-10px"); + stopRecordButton.style.setProperty("padding-right", "10px"); + recButton.style.display = "none"; + + stopRecordButton.addEventListener("click", function() { + recButton.style.display = "flex"; + }); +}); \ No newline at end of file diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py index efc62f41..f52d2542 100644 --- a/extensions/whisper_stt/script.py +++ b/extensions/whisper_stt/script.py @@ -1,5 +1,8 @@ +from pathlib import Path + import gradio as gr import speech_recognition as sr +import numpy as np 
from modules import shared @@ -45,6 +48,11 @@ def do_stt(audio, whipser_model, whipser_language): def auto_transcribe(audio, auto_submit, whipser_model, whipser_language): if audio is None: return "", "" + sample_rate, audio_data = audio + if not isinstance(audio_data[0], np.ndarray): # Workaround for Chrome audio, which appears to arrive as 1-D (presumably mono) samples + # Convert to 2 channels, so each sample s_i consists of the same value in both channels [val_i, val_i] + audio_data = np.column_stack((audio_data, audio_data)) + audio = (sample_rate, audio_data) transcription = do_stt(audio, whipser_model, whipser_language) if auto_submit: input_hijack.update({"state": True, "value": [transcription, transcription]}) @@ -55,7 +63,7 @@ def auto_transcribe(audio, auto_submit, whipser_model, whipser_language): def ui(): with gr.Accordion("Whisper STT", open=True): with gr.Row(): - audio = gr.Audio(source="microphone") + audio = gr.Audio(source="microphone", type="numpy") with gr.Row(): with gr.Accordion("Settings", open=False): auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=params['auto_submit']) @@ -69,3 +77,13 @@ def ui(): whipser_model.change(lambda x: params.update({"whipser_model": x}), whipser_model, None) whipser_language.change(lambda x: params.update({"whipser_language": x}), whipser_language, None) auto_submit.change(lambda x: params.update({"auto_submit": x}), auto_submit, None) + + +def custom_js(): + """ + Returns custom JavaScript as a string. It is applied whenever the web UI is + loaded. + :return: The contents of the script.js file located in the same directory as this file. + """ + with open(Path(__file__).parent.resolve() / "script.js", "r") as f: + return f.read()