text-generation-webui/extensions/whisper_stt/script.py

import base64
import gc
import io
from pathlib import Path

import gradio as gr
import numpy as np
import torch
import whisper
from pydub import AudioSegment

from modules import shared

# Hand-off slot between auto_transcribe() and chat_input_modifier():
# 'state' is set to True when a transcription is waiting to replace the next
# chat input, and 'value' holds the two-item list that chat_input_modifier()
# returns in place of (text, visible_text).
input_hijack = {
    'state': False,
    'value': ["", ""]
}

# parameters which can be customized in settings.yaml of webui
# NOTE: the 'whipser_*' keys are misspelled ("whisper"), but they are read and
# written under exactly these names throughout this file, so the spelling must
# stay as it is unless every use is renamed together.
params = {
    'whipser_language': 'english',  # initial value of the language dropdown
    'whipser_model': 'small.en',  # model loaded at import time and shown in the model dropdown
    'auto_submit': True  # initial value of the auto-submit checkbox
}

# Load the default model once at import time: on CUDA when it is available,
# on the CPU otherwise. reload_whispermodel() rebinds WHISPERMODEL later and
# may set it to None.
startup_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
WHISPERMODEL = whisper.load_model(params['whipser_model'], device=startup_device)


def chat_input_modifier(text, visible_text, state):
    """
    Swaps the chat input for a waiting transcription, if there is one.

    When the hijack flag is set it is cleared and the stored value is returned
    instead of the input; otherwise text and visible_text are passed through
    unchanged. The state argument is not used.
    """
    # Common case first: nothing is pending, so hand the input straight back.
    if not input_hijack['state']:
        return text, visible_text

    # Consume the pending transcription so that it is only injected once.
    input_hijack['state'] = False
    return input_hijack['value']


def do_stt(audio, whipser_language):
    """
    Transcribes encoded audio with the currently loaded Whisper model.

    :param audio: contents of an audio file as bytes; decoded with pydub's
        AudioSegment.from_file.
    :param whipser_language: language name forwarded to Whisper. When it is
        empty or None the language argument is left out of the call.
    :return: the "text" entry of the Whisper transcription result.
    :raises RuntimeError: if no model is loaded (WHISPERMODEL is None, which
        reload_whispermodel() sets when the device is "none").
    """
    # Fail early with a readable message instead of an AttributeError on None.
    if WHISPERMODEL is None:
        raise RuntimeError("Whisper model is not loaded; select a device other than 'none' first")

    # use pydub to convert sample_rate and sample_width for whisper input
    dubaudio = AudioSegment.from_file(io.BytesIO(audio))
    dubaudio = dubaudio.set_channels(1)
    dubaudio = dubaudio.set_frame_rate(16000)
    dubaudio = dubaudio.set_sample_width(2)

    # same method to get the array as openai whisper repo used from wav file
    audio_np = np.frombuffer(dubaudio.raw_data, np.int16).flatten().astype(np.float32) / 32768.0

    # Truthiness covers both "" and None, so a cleared dropdown no longer
    # crashes on len(None).
    transcribe_kwargs = {}
    if whipser_language:
        transcribe_kwargs['language'] = whipser_language

    result = WHISPERMODEL.transcribe(audio=audio_np, **transcribe_kwargs)
    return result["text"]


def auto_transcribe(audio, auto_submit, whipser_language):
    """
    Decodes base64 audio, transcribes it and optionally queues it for submit.

    :param audio: base64 audio as a string, presumably a data URL of the form
        "<header>,<base64 payload>" (TODO confirm against script.js). A string
        without a comma is treated as the bare payload.
    :param auto_submit: when truthy, the transcription is stored in
        input_hijack so that chat_input_modifier() injects it.
    :param whipser_language: language name passed through to do_stt().
    :return: the transcription, or "" when no audio was received. Always a
        single string, because the event is wired to a single output.
    """
    if not audio:
        print("Whisper received no audio data")
        # One output component is wired to this callback, so return one value
        # (the previous two-item tuple did not match the normal return path).
        return ""

    # Keep everything after the first comma; fall back to the whole string so
    # that input without a header does not raise IndexError.
    _, separator, payload = audio.partition(',')
    audio_bytes = base64.b64decode(payload if separator else audio)

    transcription = do_stt(audio_bytes, whipser_language)
    if auto_submit:
        input_hijack.update({"state": True, "value": [transcription, transcription]})
    return transcription


def reload_whispermodel(whisper_model_name: str, whisper_language: str, device: str):
    """
    Unloads the current Whisper model and loads the requested one.

    :param whisper_model_name: name of the model to load. When empty or None
        nothing is changed.
    :param whisper_language: currently selected language; forced to "english"
        when the model name contains ".en".
    :param device: "cuda", "cpu" or "none". "cuda" falls back to the CPU when
        CUDA is unavailable; "none" unloads the model without loading another.
    :return: a four-item list [model name, language, device as string,
        component update], one item per output wired to this callback. The
        update toggles "interactive" on the last output.
    """
    global WHISPERMODEL

    if not whisper_model_name:
        # Nothing selected: keep the loaded model, but still return one value
        # per output instead of falling through and returning None.
        return [whisper_model_name, whisper_language, device, gr.update()]

    # Drop the old model, collect it, and only then release cached CUDA
    # memory, so the cache actually has freed blocks to give back.
    WHISPERMODEL = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    if device == "none":
        # gr.update is component-agnostic; the wired output is not an Audio.
        return [whisper_model_name, whisper_language, str(device), gr.update(interactive=False)]

    if device == "cuda":
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    WHISPERMODEL = whisper.load_model(whisper_model_name, device=device)
    params.update({"whipser_model": whisper_model_name})
    if ".en" in whisper_model_name:
        whisper_language = "english"

    return [whisper_model_name, whisper_language, str(device), gr.update(interactive=True)]


def ui():
    """
    Builds the extension's Gradio controls and connects their events.

    Creates a hidden textbox (elem_id "audio-base64"), a record button and a
    collapsed settings panel with auto-submit, device, model and language
    controls. A change of the hidden textbox runs auto_transcribe() into the
    main chat textbox and then, if auto-submit is checked, clicks the element
    with id "Generate". Device and model changes run reload_whispermodel();
    language and auto-submit changes are stored in params.
    """
    model_choices = ["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large", "turbo"]
    language_choices = [
        "english", "chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese",
        "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi",
        "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish",
        "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin",
        "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian",
        "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic",
        "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi",
        "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian",
        "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese",
        "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar",
        "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir",
        "javanese", "sundanese",
    ]

    with gr.Accordion("Whisper STT", open=True):
        with gr.Row():
            audio = gr.Textbox(elem_id="audio-base64", visible=False)
            # The button is only created here; no Python callback is attached to it.
            gr.Button("Rec.", elem_id="record-button", elem_classes="custom-button")
        with gr.Row():
            with gr.Accordion("Settings", open=False):
                auto_submit = gr.Checkbox(
                    label='Submit the transcribed audio automatically',
                    value=params['auto_submit'],
                )
                device_dropd = gr.Dropdown(
                    label='Device',
                    value=str(startup_device),
                    choices=["cuda", "cpu", "none"],
                )
                whisper_model_dropd = gr.Dropdown(
                    label='Whisper Model',
                    value=params['whipser_model'],
                    choices=model_choices,
                )
                whisper_language = gr.Dropdown(
                    label='Whisper Language',
                    value=params['whipser_language'],
                    choices=language_choices,
                )

    # Transcribe whenever the hidden textbox changes, then optionally submit.
    transcribe_event = audio.change(
        auto_transcribe,
        [audio, auto_submit, whisper_language],
        [shared.gradio['textbox']],
    )
    transcribe_event.then(
        None, auto_submit, None,
        _js="(check) => {if (check) { document.getElementById('Generate').click() }}",
    )

    # Device and model selection share the same callback and wiring.
    reload_inputs = [whisper_model_dropd, whisper_language, device_dropd]
    reload_outputs = [whisper_model_dropd, whisper_language, device_dropd, audio]
    device_dropd.input(reload_whispermodel, reload_inputs, reload_outputs)
    whisper_model_dropd.change(reload_whispermodel, reload_inputs, reload_outputs)

    # Keep params in sync with the remaining settings.
    whisper_language.change(lambda choice: params.update({"whipser_language": choice}), whisper_language, None)
    auto_submit.change(lambda checked: params.update({"auto_submit": checked}), auto_submit, None)


def custom_js():
    """
    Returns custom javascript as a string. It is applied whenever the web UI is
    loaded.

    The script is read from "script.js" in the same directory as this file and
    is decoded as UTF-8 explicitly, so the result does not depend on the
    platform's default text encoding.
    :return: the contents of script.js
    """
    script_path = Path(__file__).parent.resolve() / "script.js"
    with open(script_path, "r", encoding="utf-8") as f:
        return f.read()
Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			`import base64`
			`import gc`
			`import io`
Addressing Whisper STT issues (#5929) 2024-06-29 06:32:54 +02:00			`from pathlib import Path`

first implementation 2023-03-09 12:46:50 +01:00			`import gradio as gr`
Addressing Whisper STT issues (#5929) 2024-06-29 06:32:54 +02:00			`import numpy as np`
Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			`import torch`
			`import whisper`
			`from pydub import AudioSegment`
Style improvements (#1957) 2023-05-10 03:49:39 +02:00
Auto-submit the whisper extension transcription 2023-04-07 20:57:29 +02:00			`from modules import shared`
cleanup 2023-03-09 20:33:00 +01:00
first implementation 2023-03-09 12:46:50 +01:00			`input_hijack = {`
			`'state': False,`
			`'value': ["", ""]`
			`}`

Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			`# parameters which can be customized in settings.yaml of webui`
Whisper_stt params for model, language, and auto_submit (#3031) 2023-07-08 01:54:53 +02:00			`params = {`
			`'whipser_language': 'english',`
			`'whipser_model': 'small.en',`
			`'auto_submit': True`
			`}`

Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			`startup_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')`
			`WHISPERMODEL = whisper.load_model(params['whipser_model'], device=startup_device)`

first implementation 2023-03-09 12:46:50 +01:00
Add extension example, replace input_hijack with chat_input_modifier (#3307) 2023-07-25 23:49:56 +02:00			`def chat_input_modifier(text, visible_text, state):`
			`global input_hijack`
			`if input_hijack['state']:`
			`input_hijack['state'] = False`
			`return input_hijack['value']`
			`else:`
			`return text, visible_text`


Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			`def do_stt(audio, whipser_language):`
			`# use pydub to convert sample_rate and sample_width for whisper input`
			`dubaudio = AudioSegment.from_file(io.BytesIO(audio))`
			`dubaudio = dubaudio.set_channels(1)`
			`dubaudio = dubaudio.set_frame_rate(16000)`
			`dubaudio = dubaudio.set_sample_width(2)`
first implementation 2023-03-09 12:46:50 +01:00
Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			`# same method to get the array as openai whisper repo used from wav file`
			`audio_np = np.frombuffer(dubaudio.raw_data, np.int16).flatten().astype(np.float32) / 32768.0`
first implementation 2023-03-09 12:46:50 +01:00
Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			`if len(whipser_language) == 0:`
			`result = WHISPERMODEL.transcribe(audio=audio_np)`
			`else:`
			`result = WHISPERMODEL.transcribe(audio=audio_np, language=whipser_language)`
			`return result["text"]`
use Gradio microphone input instead 2023-03-12 21:03:07 +01:00
first implementation 2023-03-09 12:46:50 +01:00
Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			`def auto_transcribe(audio, auto_submit, whipser_language):`
			`if audio is None or audio == "":`
			`print("Whisper received no audio data")`
add auto-transcribe option 2023-03-13 21:41:19 +01:00			`return "", ""`
Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			`audio_bytes = base64.b64decode(audio.split(',')[1])`

			`transcription = do_stt(audio_bytes, whipser_language)`
Auto-submit the whisper extension transcription 2023-04-07 20:57:29 +02:00			`if auto_submit:`
			`input_hijack.update({"state": True, "value": [transcription, transcription]})`
Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			`return transcription`


			`def reload_whispermodel(whisper_model_name: str, whisper_language: str, device: str):`
			`if len(whisper_model_name) > 0:`
			`global WHISPERMODEL`
			`WHISPERMODEL = None`
			`if torch.cuda.is_available():`
			`torch.cuda.empty_cache()`
			`gc.collect()`

			`if device != "none":`
			`if device == "cuda":`
			`device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')`
Auto-submit the whisper extension transcription 2023-04-07 20:57:29 +02:00
Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			`WHISPERMODEL = whisper.load_model(whisper_model_name, device=device)`
			`params.update({"whipser_model": whisper_model_name})`
			`if ".en" in whisper_model_name:`
			`whisper_language = "english"`
			`audio_update = gr.Audio.update(interactive=True)`
			`else:`
			`audio_update = gr.Audio.update(interactive=False)`
			`return [whisper_model_name, whisper_language, str(device), audio_update]`
add auto-transcribe option 2023-03-13 21:41:19 +01:00

first implementation 2023-03-09 12:46:50 +01:00			`def ui():`
Whisper_stt params for model, language, and auto_submit (#3031) 2023-07-08 01:54:53 +02:00			`with gr.Accordion("Whisper STT", open=True):`
			`with gr.Row():`
Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			`audio = gr.Textbox(elem_id="audio-base64", visible=False)`
			`record_button = gr.Button("Rec.", elem_id="record-button", elem_classes="custom-button")`
Whisper_stt params for model, language, and auto_submit (#3031) 2023-07-08 01:54:53 +02:00			`with gr.Row():`
			`with gr.Accordion("Settings", open=False):`
			`auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=params['auto_submit'])`
Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			`device_dropd = gr.Dropdown(label='Device', value=str(startup_device), choices=["cuda", "cpu", "none"])`
Add whisper turbo (#6423) 2024-10-01 22:49:35 +02:00			`whisper_model_dropd = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large", "turbo"])`
Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			whisper_language = gr.Dropdown(label='Whisper Language', value=params['whipser_language'], choices=["english", "chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"])
Reorganize whisper extension 2023-04-13 15:40:27 +02:00
Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			`audio.change(`
			`auto_transcribe, [audio, auto_submit, whisper_language], [shared.gradio['textbox']]).then(`
			`None, auto_submit, None, _js="(check) => {if (check) { document.getElementById('Generate').click() }}")`
Add extension example, replace input_hijack with chat_input_modifier (#3307) 2023-07-25 23:49:56 +02:00
Whisper stt overhaul js (#6194) --------- Co-authored-by: RandoInternetPreson <aaronalai1@gmail.com> 2024-07-02 04:27:18 +02:00			`device_dropd.input(reload_whispermodel, [whisper_model_dropd, whisper_language, device_dropd], [whisper_model_dropd, whisper_language, device_dropd, audio])`
			`whisper_model_dropd.change(reload_whispermodel, [whisper_model_dropd, whisper_language, device_dropd], [whisper_model_dropd, whisper_language, device_dropd, audio])`
			`whisper_language.change(lambda x: params.update({"whipser_language": x}), whisper_language, None)`
Whisper_stt params for model, language, and auto_submit (#3031) 2023-07-08 01:54:53 +02:00			`auto_submit.change(lambda x: params.update({"auto_submit": x}), auto_submit, None)`
Addressing Whisper STT issues (#5929) 2024-06-29 06:32:54 +02:00

			`def custom_js():`
			`"""`
			`Returns custom javascript as a string. It is applied whenever the web UI is`
			`loaded.`
			`:return:`
			`"""`
			`with open(Path(__file__).parent.resolve() / "script.js", "r") as f:`
			`return f.read()`