# text-generation-webui/extensions/whisper_stt/script.py
#
# 45 lines
# 1.3 KiB
# Python
# Raw Normal View History
#
# 2023-03-09 12:46:50 +01:00
import gradio as gr
import speech_recognition as sr
from modules import shared
# 2023-03-09 20:33:00 +01:00
# 2023-03-09 12:46:50 +01:00
# Module-level hand-off slot. It starts inactive ('state' False) with a pair
# of empty strings as 'value'; auto_transcribe() flips 'state' to True and
# stores the transcription in both positions when auto-submit is enabled.
# NOTE(review): the consumer of this dict is not visible in this file —
# presumably the host application reads it; confirm against the caller.
input_hijack = dict(state=False, value=["", ""])
def do_stt(audio, language="english", model="base.en"):
    """Transcribe a recorded audio clip to text with Whisper.

    Args:
        audio: Indexable pair whose element 0 is used as the sample rate and
            element 1 as the frame data for ``sr.AudioData``. ``None`` (no
            recording available) produces an empty transcription.
        language: Language forwarded to ``Recognizer.recognize_whisper``.
        model: Whisper model name forwarded to
            ``Recognizer.recognize_whisper``.

    Returns:
        The transcribed text, or ``""`` when there is no audio or when
        recognition fails (the failure is printed, not raised).
    """
    # Nothing was recorded: indexing None below would raise TypeError.
    if audio is None:
        return ""

    recognizer = sr.Recognizer()

    # Convert to AudioData.
    # NOTE(review): sample_width=4 assumes 4-byte samples in audio[1] —
    # confirm against what the audio widget actually delivers.
    audio_data = sr.AudioData(sample_rate=audio[0], frame_data=audio[1], sample_width=4)

    try:
        return recognizer.recognize_whisper(audio_data, language=language, model=model)
    except sr.UnknownValueError:
        print("Whisper could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Whisper", e)

    # Best-effort: a failed recognition yields an empty transcription.
    return ""
# 2023-03-12 21:03:07 +01:00
# 2023-03-09 12:46:50 +01:00
def auto_transcribe(audio, auto_submit):
    """Transcribe freshly recorded audio and optionally queue it for submit.

    Wired in ``ui()`` as the ``audio.change`` handler, with outputs
    ``[shared.gradio['textbox'], audio]``.

    Args:
        audio: Value of the audio component, or ``None`` when there is no
            recording.
        auto_submit: Truthy when the transcription should also be stored in
            ``input_hijack`` for automatic submission.

    Returns:
        A 2-tuple for the two outputs: ``("", "")`` when ``audio`` is
        ``None``, otherwise ``(transcription, None)``.
    """
    if audio is None:
        return "", ""

    transcription = do_stt(audio)
    if auto_submit:
        # Activate the hijack slot, storing the text in both positions.
        input_hijack.update({"state": True, "value": [transcription, transcription]})

    # Second element is the new value for the audio output.
    return transcription, None
# 2023-03-13 21:41:19 +01:00
# 2023-03-09 12:46:50 +01:00
def ui():
    """Build the extension's widgets and attach the audio change handlers.

    Creates a row holding a microphone audio input and an auto-submit
    checkbox, then registers two ``change`` listeners on the audio
    component: one running ``auto_transcribe`` and one JavaScript-only
    listener that clicks the element with id ``Generate`` when the checkbox
    is ticked.
    """
    with gr.Row():
        audio = gr.Audio(source="microphone")
        auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=True)

    # Python side: write the transcription to the main textbox and set the
    # audio component from auto_transcribe's second return value.
    audio.change(
        fn=auto_transcribe,
        inputs=[audio, auto_submit],
        outputs=[shared.gradio['textbox'], audio],
    )

    # Browser side: no Python callback; only trigger the Generate button
    # when auto-submit is checked.
    audio.change(None, auto_submit, None, _js="(check) => {if (check) { document.getElementById('Generate').click() }}")