Merge remote-tracking branch 'origin/dev' into fix_trailing_spaces

2024-11-29 10:59:32 +01:00 · 2024-04-22 00:16:14 -04:00 · 2024-04-22 00:16:14 -04:00 · 857a2ca5b3
commit 857a2ca5b3
parent 41aa53ccf2 0877741b03
15 changed files with 72 additions and 48 deletions
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@ -135,6 +135,7 @@ def convert_history(history):
    current_message = ""
    current_reply = ""
    user_input = ""
+    user_input_last = True
    system_message = ""

    # Multimodal: convert OpenAI format to multimodal extension format
@ -188,6 +189,7 @@ def convert_history(history):

        if role == "user":
            user_input = content
+            user_input_last = True
            if current_message:
                chat_dialogue.append([current_message, ''])
                current_message = ""
@ -195,6 +197,7 @@ def convert_history(history):
            current_message = content
        elif role == "assistant":
            current_reply = content
+            user_input_last = False
            if current_message:
                chat_dialogue.append([current_message, current_reply])
                current_message = ""
@ -204,13 +207,13 @@ def convert_history(history):
        elif role == "system":
            system_message = content

-    # if current_message:
-    #     chat_dialogue.append([current_message, ''])
+    if not user_input_last:
+        user_input = ""

    return user_input, system_message, {'internal': chat_dialogue, 'visible': copy.deepcopy(chat_dialogue)}


-def chat_completions_common(body: dict, is_legacy: bool = False, stream=False) -> dict:
+def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, prompt_only=False) -> dict:
    if body.get('functions', []):
        raise InvalidRequestError(message="functions is not supported.", param='functions')

@ -310,14 +313,18 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False) -
        #    chunk[resp_list][0]["logprobs"] = None
        return chunk

-    if stream:
-        yield chat_streaming_chunk('')
-
    # generate reply #######################################
-    prompt = generate_chat_prompt(user_input, generate_params)
+    prompt = generate_chat_prompt(user_input, generate_params, _continue=continue_)
+    if prompt_only:
+        yield {'prompt': prompt}
+        return
+
    token_count = len(encode(prompt)[0])
    debug_msg({'prompt': prompt, 'generate_params': generate_params})

+    if stream:
+        yield chat_streaming_chunk('')
+
    generator = generate_chat_reply(
        user_input, generate_params, regenerate=False, _continue=continue_, loading_message=False)

--- a/extensions/openai/models.py
+++ b/extensions/openai/models.py
@ -9,7 +9,8 @@ from modules.utils import get_available_loras, get_available_models
 def get_current_model_info():
    return {
        'model_name': shared.model_name,
-        'lora_names': shared.lora_names
+        'lora_names': shared.lora_names,
+        'loader': shared.args.loader
    }


--- a/extensions/openai/script.py
+++ b/extensions/openai/script.py
@ -3,6 +3,7 @@ import json
 import logging
 import os
 import traceback
+from collections import deque
 from threading import Thread

 import speech_recognition as sr
@ -31,6 +32,7 @@ from modules.text_generation import stop_everything_event
 from .typing import (
    ChatCompletionRequest,
    ChatCompletionResponse,
+    ChatPromptResponse,
    CompletionRequest,
    CompletionResponse,
    DecodeRequest,
@ -259,6 +261,15 @@ async def handle_logits(request_data: LogitsRequest):
    return JSONResponse(response)


+@app.post('/v1/internal/chat-prompt', response_model=ChatPromptResponse, dependencies=check_key)
+async def handle_chat_prompt(request: Request, request_data: ChatCompletionRequest):
+    path = request.url.path
+    is_legacy = "/generate" in path
+    generator = OAIcompletions.chat_completions_common(to_dict(request_data), is_legacy=is_legacy, prompt_only=True)
+    response = deque(generator, maxlen=1).pop()
+    return JSONResponse(response)
+
+
@app.post("/v1/internal/stop-generation", dependencies=check_key)
 async def handle_stop_generation(request: Request):
    stop_everything_event()
--- a/extensions/openai/typing.py
+++ b/extensions/openai/typing.py
@ -124,6 +124,10 @@ class ChatCompletionResponse(BaseModel):
    usage: dict


+class ChatPromptResponse(BaseModel):
+    prompt: str
+
+
 class EmbeddingsRequest(BaseModel):
    input: str | List[str] | List[int] | List[List[int]]
    model: str | None = Field(default=None, description="Unused parameter. To change the model, set the OPENEDAI_EMBEDDING_MODEL and OPENEDAI_EMBEDDING_DEVICE environment variables before starting the server.")
--- a/extensions/whisper_stt/script.py
+++ b/extensions/whisper_stt/script.py
@ -62,7 +62,7 @@ def ui():
                whipser_model = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large"])
                whipser_language = gr.Dropdown(label='Whisper Language', value=params['whipser_language'], choices=["chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"])

-    audio.change(
+    audio.stop_recording(
        auto_transcribe, [audio, auto_submit, whipser_model, whipser_language], [shared.gradio['textbox'], audio]).then(
        None, auto_submit, None, js="(check) => {if (check) { document.getElementById('Generate').click() }}")

--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@ -136,9 +136,6 @@ def get_model_metadata(model):
    if 'instruction_template' not in model_settings:
        model_settings['instruction_template'] = 'Alpaca'

-    if model_settings['instruction_template'] != 'Custom (obtained from model metadata)':
-        model_settings['instruction_template_str'] = chat.load_instruction_template(model_settings['instruction_template'])
-
    # Ignore rope_freq_base if set to the default value
    if 'rope_freq_base' in model_settings and model_settings['rope_freq_base'] == 10000:
        model_settings.pop('rope_freq_base')
@ -150,6 +147,10 @@ def get_model_metadata(model):
            for k in settings[pat]:
                model_settings[k] = settings[pat][k]

+    # Load instruction template if defined by name rather than by value
+    if model_settings['instruction_template'] != 'Custom (obtained from model metadata)':
+        model_settings['instruction_template_str'] = chat.load_instruction_template(model_settings['instruction_template'])
+
    return model_settings


--- a/requirements.txt
+++ b/requirements.txt
@ -23,14 +23,14 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.39.*
+transformers==4.40.*
 tqdm
 wandb

 # API
 SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
-sse-starlette==2.1.0
+sse-starlette==1.6.5
 tiktoken

 # llama-cpp-python (CPU only, AVX2)
@ -56,11 +56,11 @@ https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu1
 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
 https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@ -21,14 +21,14 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.39.*
+transformers==4.40.*
 tqdm
 wandb

 # API
 SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
-sse-starlette==2.1.0
+sse-starlette==1.6.5
 tiktoken

 # llama-cpp-python (CPU only, AVX2)
@ -42,9 +42,9 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/ro
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.61+rocm5.6.1-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@ -21,14 +21,14 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.39.*
+transformers==4.40.*
 tqdm
 wandb

 # API
 SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
-sse-starlette==2.1.0
+sse-starlette==1.6.5
 tiktoken

 # llama-cpp-python (CPU only, no AVX2)
@ -40,9 +40,9 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp
 # AMD wheels
 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@ -21,14 +21,14 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.39.*
+transformers==4.40.*
 tqdm
 wandb

 # API
 SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
-sse-starlette==2.1.0
+sse-starlette==1.6.5
 tiktoken

 # Mac wheels
@ -38,4 +38,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.61-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.61-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.61-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18-py3-none-any.whl
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19-py3-none-any.whl
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@ -21,14 +21,14 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.39.*
+transformers==4.40.*
 tqdm
 wandb

 # API
 SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
-sse-starlette==2.1.0
+sse-starlette==1.6.5
 tiktoken

 # Mac wheels
@ -40,4 +40,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.61-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.61-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.61-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18-py3-none-any.whl
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19-py3-none-any.whl
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@ -21,14 +21,14 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.39.*
+transformers==4.40.*
 tqdm
 wandb

 # API
 SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
-sse-starlette==2.1.0
+sse-starlette==1.6.5
 tiktoken

 # llama-cpp-python (CPU only, AVX2)
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@ -21,14 +21,14 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.39.*
+transformers==4.40.*
 tqdm
 wandb

 # API
 SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
-sse-starlette==2.1.0
+sse-starlette==1.6.5
 tiktoken

 # llama-cpp-python (CPU only, no AVX2)
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@ -23,14 +23,14 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.39.*
+transformers==4.40.*
 tqdm
 wandb

 # API
 SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
-sse-starlette==2.1.0
+sse-starlette==1.6.5
 tiktoken

 # llama-cpp-python (CPU only, no AVX2)
@ -56,11 +56,11 @@ https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu1
 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
 https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@ -21,12 +21,12 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.39.*
+transformers==4.40.*
 tqdm
 wandb

 # API
 SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
-sse-starlette==2.1.0
+sse-starlette==1.6.5
 tiktoken