import base64
import gc
import io
from pathlib import Path

import gradio as gr
import numpy as np
import torch
import whisper
from pydub import AudioSegment

from modules import shared

# When 'state' is set, chat_input_modifier() replaces the next chat input
# with 'value' ([visible text, internal text]) instead of what the user typed.
input_hijack = {
    'state': False,
    'value': ["", ""]
}

# parameters which can be customized in settings.yaml of webui
params = {
    'whipser_language': 'english',
    'whipser_model': 'small.en',
    'auto_submit': True
}

startup_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
WHISPERMODEL = whisper.load_model(params['whipser_model'], device=startup_device)


def chat_input_modifier(text, visible_text, state):
    global input_hijack
    if input_hijack['state']:
        input_hijack['state'] = False
        return input_hijack['value']
    else:
        return text, visible_text


def do_stt(audio, whipser_language):
    # use pydub to convert sample_rate and sample_width for whisper input
    dubaudio = AudioSegment.from_file(io.BytesIO(audio))
    dubaudio = dubaudio.set_channels(1)
    dubaudio = dubaudio.set_frame_rate(16000)
    dubaudio = dubaudio.set_sample_width(2)

    # same method to get the array as the openai whisper repo uses for wav files
    audio_np = np.frombuffer(dubaudio.raw_data, np.int16).flatten().astype(np.float32) / 32768.0

    if len(whipser_language) == 0:
        result = WHISPERMODEL.transcribe(audio=audio_np)
    else:
        result = WHISPERMODEL.transcribe(audio=audio_np, language=whipser_language)
    return result["text"]


def auto_transcribe(audio, auto_submit, whipser_language):
    if audio is None or audio == "":
        print("Whisper received no audio data")
        return ""

    # strip the "data:audio/...;base64," prefix of the data URL written by script.js
    audio_bytes = base64.b64decode(audio.split(',')[1])

    transcription = do_stt(audio_bytes, whipser_language)
    if auto_submit:
        input_hijack.update({"state": True, "value": [transcription, transcription]})
    return transcription


def reload_whispermodel(whisper_model_name: str, whisper_language: str, device: str):
    if len(whisper_model_name) > 0:
        # drop the old model and free its memory before loading the new one
        global WHISPERMODEL
        WHISPERMODEL = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

        if device != "none":
            if device == "cuda":
                device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

            WHISPERMODEL = whisper.load_model(whisper_model_name, device=device)
            params.update({"whipser_model": whisper_model_name})
            # .en models are English-only
            if ".en" in whisper_model_name:
                whisper_language = "english"
            audio_update = gr.Audio.update(interactive=True)
        else:
            # no model loaded, so disable the audio input
            audio_update = gr.Audio.update(interactive=False)

        return [whisper_model_name, whisper_language, str(device), audio_update]


def ui():
    with gr.Accordion("Whisper STT", open=True):
        with gr.Row():
            # hidden textbox that script.js fills with the recording as a base64 data URL
            audio = gr.Textbox(elem_id="audio-base64", visible=False)
            # the click handler for this button lives in script.js
            record_button = gr.Button("Rec.", elem_id="record-button", elem_classes="custom-button")
        with gr.Row():
            with gr.Accordion("Settings", open=False):
                auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=params['auto_submit'])
                device_dropd = gr.Dropdown(label='Device', value=str(startup_device), choices=["cuda", "cpu", "none"])
                whisper_model_dropd = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large", "turbo"])
                whisper_language = gr.Dropdown(label='Whisper Language', value=params['whipser_language'], choices=[
                    "english", "chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese",
                    "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi",
                    "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish",
                    "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin",
                    "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian",
                    "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic",
                    "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician",
                    "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan",
                    "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek",
                    "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit",
                    "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian",
                    "lingala", "hausa", "bashkir", "javanese", "sundanese"])

    # transcribe whenever script.js writes new audio data, then optionally auto-submit
    audio.change(
        auto_transcribe, [audio, auto_submit, whisper_language], [shared.gradio['textbox']]).then(
        None, auto_submit, None, _js="(check) => {if (check) { document.getElementById('Generate').click() }}")

    device_dropd.input(reload_whispermodel, [whisper_model_dropd, whisper_language, device_dropd], [whisper_model_dropd, whisper_language, device_dropd, audio])
    whisper_model_dropd.change(reload_whispermodel, [whisper_model_dropd, whisper_language, device_dropd], [whisper_model_dropd, whisper_language, device_dropd, audio])
    whisper_language.change(lambda x: params.update({"whipser_language": x}), whisper_language, None)
    auto_submit.change(lambda x: params.update({"auto_submit": x}), auto_submit, None)


def custom_js():
    """
    Returns custom javascript as a string. It is applied whenever the web UI is loaded.
    :return:
    """
    with open(Path(__file__).parent.resolve() / "script.js", "r") as f:
        return f.read()