From 48aa52849bb938e62fdada0f4cebcd049e8536ec Mon Sep 17 00:00:00 2001 From: EliasVincent Date: Sun, 12 Mar 2023 21:03:07 +0100 Subject: [PATCH] use Gradio microphone input instead --- extensions/whisper_stt/README.md | 39 ------------------------- extensions/whisper_stt/requirements.txt | 1 - extensions/whisper_stt/script.py | 27 ++++++++++------- 3 files changed, 17 insertions(+), 50 deletions(-) delete mode 100644 extensions/whisper_stt/README.md diff --git a/extensions/whisper_stt/README.md b/extensions/whisper_stt/README.md deleted file mode 100644 index 068f5dda..00000000 --- a/extensions/whisper_stt/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Installation instructions - -- On all platforms, run `pip install -r requirements.txt` in this folder -- You need **PortAudio** to run the speech recognition. Below are guides for all platforms - - -## Windows - -- You don't need to do anything, `pyaudio` already comes with PortAudio included on Windows. - -## Mac - -```commandline -brew install portaudio -brew link --overwrite portaudio -pip install pyaudio -``` - -## Linux - -- You have to use your distro's package manager to install PortAudio. - -### Ubuntu / Debian / Mint - -```commandline -sudo apt install portaudio19-dev python3-pyaudio -``` - -### Arch / Manjaro - -```commandline -sudo pacman -S portaudio -``` - -### Fedora - -```commandline -sudo dnf -y install portaudio -``` \ No newline at end of file diff --git a/extensions/whisper_stt/requirements.txt b/extensions/whisper_stt/requirements.txt index e6e3255f..770c38bb 100644 --- a/extensions/whisper_stt/requirements.txt +++ b/extensions/whisper_stt/requirements.txt @@ -1,5 +1,4 @@ git+https://github.com/Uberi/speech_recognition.git@010382b -PyAudio openai-whisper soundfile ffmpeg diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py index dec1efb0..b2e840a8 100644 --- a/extensions/whisper_stt/script.py +++ b/extensions/whisper_stt/script.py @@ -7,22 +7,24 @@ input_hijack = { } -def do_stt(): +def do_stt(audio, text_state=""): transcription = "" r = sr.Recognizer() - with sr.Microphone() as source: - r.adjust_for_ambient_noise(source, 0.2) - audio = r.listen(source) + + # Convert to AudioData + audio_data = sr.AudioData(sample_rate=audio[0], frame_data=audio[1], sample_width=4) try: - transcription = r.recognize_whisper(audio, language="english", model="base.en") + transcription = r.recognize_whisper(audio_data, language="english", model="base.en") except sr.UnknownValueError: print("Whisper could not understand audio") except sr.RequestError as e: print("Could not request results from Whisper", e) input_hijack.update({"state": True, "value": [transcription, transcription]}) - return transcription + + text_state += transcription + " " + return text_state, text_state def update_hijack(val): @@ -31,7 +33,12 @@ def update_hijack(val): def ui(): - speech_button = gr.Button(value="🎙️") - output_transcription = gr.Textbox(label="STT-Input", placeholder="Speech Preview. Click \"Generate\" to send", interactive=True) - output_transcription.change(fn=update_hijack, inputs=[output_transcription]) - speech_button.click(do_stt, outputs=[output_transcription]) + tr_state = gr.State(value="") + output_transcription = gr.Textbox(label="STT-Input", + placeholder="Speech Preview. Click \"Generate\" to send", + interactive=True) + output_transcription.change(fn=update_hijack, inputs=[output_transcription], outputs=[tr_state]) + with gr.Row(): + audio = gr.Audio(source="microphone") + transcribe_button = gr.Button(value="Transcribe") + transcribe_button.click(do_stt, inputs=[audio, tr_state], outputs=[output_transcription, tr_state])