From 4c72e43bcfb70102a7330a5748ca2ea0989f0ffb Mon Sep 17 00:00:00 2001 From: EliasVincent Date: Thu, 9 Mar 2023 12:46:50 +0100 Subject: [PATCH 1/7] first implementation --- extensions/whisper_stt/requirements.txt | 5 ++++ extensions/whisper_stt/script.py | 40 +++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 extensions/whisper_stt/requirements.txt create mode 100644 extensions/whisper_stt/script.py diff --git a/extensions/whisper_stt/requirements.txt b/extensions/whisper_stt/requirements.txt new file mode 100644 index 00000000..e6e3255f --- /dev/null +++ b/extensions/whisper_stt/requirements.txt @@ -0,0 +1,5 @@ +git+https://github.com/Uberi/speech_recognition.git@010382b +PyAudio +openai-whisper +soundfile +ffmpeg diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py new file mode 100644 index 00000000..287a1fdd --- /dev/null +++ b/extensions/whisper_stt/script.py @@ -0,0 +1,40 @@ +import gradio as gr +import speech_recognition as sr +import modules.shared as shared + +input_hijack = { + 'state': False, + 'value': ["", ""] +} + + +def input_modifier(string): + return string + + +def do_stt(): + transcription = "" + r = sr.Recognizer() + with sr.Microphone() as source: + print("Say something!") + r.adjust_for_ambient_noise(source) + audio = r.listen(source) + + # recognize speech using whisper + try: + transcription = r.recognize_whisper(audio, language="english", model="tiny.en") + print("Whisper thinks you said " + transcription) + except sr.UnknownValueError: + print("Whisper could not understand audio") + except sr.RequestError as e: + print("Could not request results from Whisper") + + # input_modifier(transcription) + input_hijack.update({"state": True, "value": [transcription, transcription]}) + return transcription + + +def ui(): + speech_button = gr.Button(value="STT") + output_transcription = gr.Textbox(label="Speech Preview") + speech_button.click(do_stt, outputs=[output_transcription]) From 7a03d0bda357cb781f8675f42caa35a69b79f613 Mon Sep 17 00:00:00 2001 From: EliasVincent Date: Thu, 9 Mar 2023 20:33:00 +0100 Subject: [PATCH 2/7] cleanup --- extensions/whisper_stt/script.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py index 287a1fdd..dcb4dfc9 100644 --- a/extensions/whisper_stt/script.py +++ b/extensions/whisper_stt/script.py @@ -1,6 +1,6 @@ import gradio as gr import speech_recognition as sr -import modules.shared as shared + input_hijack = { 'state': False, @@ -16,25 +16,21 @@ def do_stt(): transcription = "" r = sr.Recognizer() with sr.Microphone() as source: - print("Say something!") r.adjust_for_ambient_noise(source) audio = r.listen(source) - # recognize speech using whisper try: transcription = r.recognize_whisper(audio, language="english", model="tiny.en") - print("Whisper thinks you said " + transcription) except sr.UnknownValueError: print("Whisper could not understand audio") except sr.RequestError as e: - print("Could not request results from Whisper") + print("Could not request results from Whisper", e) - # input_modifier(transcription) input_hijack.update({"state": True, "value": [transcription, transcription]}) return transcription def ui(): - speech_button = gr.Button(value="STT") - output_transcription = gr.Textbox(label="Speech Preview") + speech_button = gr.Button(value="🎙️") + output_transcription = gr.Textbox(label="STT-Preview", placeholder="Speech Preview. Click \"Generate\" to send") speech_button.click(do_stt, outputs=[output_transcription]) From 00359ba054797d5115c30a292f0f919c514f0046 Mon Sep 17 00:00:00 2001 From: EliasVincent Date: Thu, 9 Mar 2023 21:03:49 +0100 Subject: [PATCH 3/7] interactive preview window --- extensions/whisper_stt/script.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py index dcb4dfc9..9f07e5c9 100644 --- a/extensions/whisper_stt/script.py +++ b/extensions/whisper_stt/script.py @@ -1,17 +1,12 @@ import gradio as gr import speech_recognition as sr - input_hijack = { 'state': False, 'value': ["", ""] } -def input_modifier(string): - return string - - def do_stt(): transcription = "" r = sr.Recognizer() @@ -30,7 +25,13 @@ def do_stt(): return transcription +def update_hijack(val): + input_hijack.update({"state": True, "value": [val, val]}) + return val + + def ui(): speech_button = gr.Button(value="🎙️") - output_transcription = gr.Textbox(label="STT-Preview", placeholder="Speech Preview. Click \"Generate\" to send") + output_transcription = gr.Textbox(label="STT-Input", placeholder="Speech Preview. Click \"Generate\" to send", interactive=True) + output_transcription.change(fn=update_hijack, inputs=[output_transcription]) speech_button.click(do_stt, outputs=[output_transcription]) From a24fa781f1627ffc0c15cf56f1eb1b1f8ee26876 Mon Sep 17 00:00:00 2001 From: EliasVincent Date: Thu, 9 Mar 2023 21:18:46 +0100 Subject: [PATCH 4/7] tweaked Whisper parameters --- extensions/whisper_stt/script.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py index 9f07e5c9..dec1efb0 100644 --- a/extensions/whisper_stt/script.py +++ b/extensions/whisper_stt/script.py @@ -11,11 +11,11 @@ def do_stt(): transcription = "" r = sr.Recognizer() with sr.Microphone() as source: - r.adjust_for_ambient_noise(source) + r.adjust_for_ambient_noise(source, 0.2) audio = r.listen(source) try: - transcription = r.recognize_whisper(audio, language="english", model="tiny.en") + transcription = r.recognize_whisper(audio, language="english", model="base.en") except sr.UnknownValueError: print("Whisper could not understand audio") except sr.RequestError as e: From 1c0bda33fb713ad7d3811300babf606e57253e8d Mon Sep 17 00:00:00 2001 From: EliasVincent Date: Fri, 10 Mar 2023 11:47:16 +0100 Subject: [PATCH 5/7] added installation instructions --- extensions/whisper_stt/README.md | 39 ++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 extensions/whisper_stt/README.md diff --git a/extensions/whisper_stt/README.md b/extensions/whisper_stt/README.md new file mode 100644 index 00000000..068f5dda --- /dev/null +++ b/extensions/whisper_stt/README.md @@ -0,0 +1,39 @@ +# Installation instructions + +- On all platforms, run `pip install -r requirements.txt` in this folder +- You need **PortAudio** to run the speech recognition. Below are guides for all platforms + + +## Windows + +- You don't need to do anything, `pyaudio` already comes with PortAudio included on Windows. + +## Mac + +```commandline +brew install portaudio +brew link --overwrite portaudio +pip install pyaudio +``` + +## Linux + +- You have to use your distro's package manager to install PortAudio. + +### Ubuntu / Debian / Mint + +```commandline +sudo apt install portaudio19-dev python3-pyaudio +``` + +### Arch / Manjaro + +```commandline +sudo pacman -S portaudio +``` + +### Fedora + +```commandline +sudo dnf -y install portaudio +``` \ No newline at end of file From 48aa52849bb938e62fdada0f4cebcd049e8536ec Mon Sep 17 00:00:00 2001 From: EliasVincent Date: Sun, 12 Mar 2023 21:03:07 +0100 Subject: [PATCH 6/7] use Gradio microphone input instead --- extensions/whisper_stt/README.md | 39 ------------------------- extensions/whisper_stt/requirements.txt | 1 - extensions/whisper_stt/script.py | 27 ++++++++++------- 3 files changed, 17 insertions(+), 50 deletions(-) delete mode 100644 extensions/whisper_stt/README.md diff --git a/extensions/whisper_stt/README.md b/extensions/whisper_stt/README.md deleted file mode 100644 index 068f5dda..00000000 --- a/extensions/whisper_stt/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Installation instructions - -- On all platforms, run `pip install -r requirements.txt` in this folder -- You need **PortAudio** to run the speech recognition. Below are guides for all platforms - - -## Windows - -- You don't need to do anything, `pyaudio` already comes with PortAudio included on Windows. - -## Mac - -```commandline -brew install portaudio -brew link --overwrite portaudio -pip install pyaudio -``` - -## Linux - -- You have to use your distro's package manager to install PortAudio. - -### Ubuntu / Debian / Mint - -```commandline -sudo apt install portaudio19-dev python3-pyaudio -``` - -### Arch / Manjaro - -```commandline -sudo pacman -S portaudio -``` - -### Fedora - -```commandline -sudo dnf -y install portaudio -``` \ No newline at end of file diff --git a/extensions/whisper_stt/requirements.txt b/extensions/whisper_stt/requirements.txt index e6e3255f..770c38bb 100644 --- a/extensions/whisper_stt/requirements.txt +++ b/extensions/whisper_stt/requirements.txt @@ -1,5 +1,4 @@ git+https://github.com/Uberi/speech_recognition.git@010382b -PyAudio openai-whisper soundfile ffmpeg diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py index dec1efb0..b2e840a8 100644 --- a/extensions/whisper_stt/script.py +++ b/extensions/whisper_stt/script.py @@ -7,22 +7,24 @@ input_hijack = { } -def do_stt(): +def do_stt(audio, text_state=""): transcription = "" r = sr.Recognizer() - with sr.Microphone() as source: - r.adjust_for_ambient_noise(source, 0.2) - audio = r.listen(source) + + # Convert to AudioData + audio_data = sr.AudioData(sample_rate=audio[0], frame_data=audio[1], sample_width=4) try: - transcription = r.recognize_whisper(audio, language="english", model="base.en") + transcription = r.recognize_whisper(audio_data, language="english", model="base.en") except sr.UnknownValueError: print("Whisper could not understand audio") except sr.RequestError as e: print("Could not request results from Whisper", e) input_hijack.update({"state": True, "value": [transcription, transcription]}) - return transcription + + text_state += transcription + " " + return text_state, text_state def update_hijack(val): @@ -31,7 +33,12 @@ def update_hijack(val): def ui(): - speech_button = gr.Button(value="🎙️") - output_transcription = gr.Textbox(label="STT-Input", placeholder="Speech Preview. Click \"Generate\" to send", interactive=True) - output_transcription.change(fn=update_hijack, inputs=[output_transcription]) - speech_button.click(do_stt, outputs=[output_transcription]) + tr_state = gr.State(value="") + output_transcription = gr.Textbox(label="STT-Input", + placeholder="Speech Preview. Click \"Generate\" to send", + interactive=True) + output_transcription.change(fn=update_hijack, inputs=[output_transcription], outputs=[tr_state]) + with gr.Row(): + audio = gr.Audio(source="microphone") + transcribe_button = gr.Button(value="Transcribe") + transcribe_button.click(do_stt, inputs=[audio, tr_state], outputs=[output_transcription, tr_state]) From 02e1113d955832990cc97a0c00315753e7100837 Mon Sep 17 00:00:00 2001 From: EliasVincent Date: Mon, 13 Mar 2023 21:41:19 +0100 Subject: [PATCH 7/7] add auto-transcribe option --- extensions/whisper_stt/script.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py index b2e840a8..6ef60c57 100644 --- a/extensions/whisper_stt/script.py +++ b/extensions/whisper_stt/script.py @@ -32,13 +32,23 @@ def update_hijack(val): return val +def auto_transcribe(audio, audio_auto, text_state=""): + if audio is None: + return "", "" + if audio_auto: + return do_stt(audio, text_state) + return "", "" + + def ui(): tr_state = gr.State(value="") output_transcription = gr.Textbox(label="STT-Input", placeholder="Speech Preview. Click \"Generate\" to send", interactive=True) output_transcription.change(fn=update_hijack, inputs=[output_transcription], outputs=[tr_state]) + audio_auto = gr.Checkbox(label="Auto-Transcribe", value=True) with gr.Row(): audio = gr.Audio(source="microphone") + audio.change(fn=auto_transcribe, inputs=[audio, audio_auto, tr_state], outputs=[output_transcription, tr_state]) transcribe_button = gr.Button(value="Transcribe") transcribe_button.click(do_stt, inputs=[audio, tr_state], outputs=[output_transcription, tr_state])