Mirror of https://github.com/oobabooga/text-generation-webui.git, synced 2024-11-26 01:30:20 +01:00.
Run in executor for long blocking functions.

Commit 3ffb09d465 (parent 5770e06c48).
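The change applies one pattern across the OpenAI-compatible API endpoints: bind the blocking call's arguments with functools.partial, take the appropriate asyncio.Semaphore, and await the call through the event loop's executor so long calls no longer block the loop. A minimal, self-contained sketch of that pattern (blocking_work, handler and semaphore are illustrative names, not the commit's code):

# Minimal sketch of the pattern this commit applies, assuming Python 3.10+.
import asyncio
import functools
import time

semaphore = asyncio.Semaphore(1)  # serialize heavy calls of the same kind


def blocking_work(x: int) -> int:
    time.sleep(1)  # stands in for a long model/tokenizer call
    return x * 2


async def handler(x: int) -> int:
    async with semaphore:
        loop = asyncio.get_running_loop()
        call = functools.partial(blocking_work, x)
        # The blocking call runs in the default thread pool, so the event loop
        # keeps serving other requests while it waits.
        return await loop.run_in_executor(None, call)


print(asyncio.run(handler(21)))  # 42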
extensions/openai/script.py

@@ -23,11 +23,12 @@ import extensions.openai.models as OAImodels
 import extensions.openai.moderations as OAImoderations
 from extensions.openai.errors import ServiceUnavailableError
 from extensions.openai.tokens import token_count, token_decode, token_encode
-from extensions.openai.utils import _start_cloudflared
+from extensions.openai.utils import _start_cloudflared, generate_in_executor, run_in_executor
 from modules import shared
 from modules.logging_colors import logger
 from modules.models import unload_model
 from modules.text_generation import stop_everything_event
+import functools
 
 from .typing import (
     ChatCompletionRequest,
@@ -59,8 +60,12 @@ params = {
     'debug': 0
 }
 
-streaming_semaphore = asyncio.Semaphore(1)
+# Allow some actions to run at the same time.
+text_generation_semaphore = asyncio.Semaphore(1)  # Use same lock for streaming and generations.
+embedding_semaphore = asyncio.Semaphore(1)
+stt_semaphore = asyncio.Semaphore(1)
+io_semaphore = asyncio.Semaphore(1)
+small_tasks_semaphore = asyncio.Semaphore(5)
 
 
 def verify_api_key(authorization: str = Header(None)) -> None:
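For reference, asyncio.Semaphore(n) caps how many of the guarded blocks may run concurrently: with n=1 it behaves like a mutex (generation, embeddings, STT, model I/O), while small_tasks_semaphore allows five lightweight jobs at once. A tiny illustrative sketch (worker and main are hypothetical names):

# Illustrative only: with Semaphore(1) the guarded sections run strictly one
# at a time; Semaphore(5) would let up to five run concurrently.
import asyncio


async def worker(sem: asyncio.Semaphore, i: int) -> None:
    async with sem:
        print(f"worker {i} enters")
        await asyncio.sleep(0.1)
        print(f"worker {i} leaves")


async def main() -> None:
    sem = asyncio.Semaphore(1)
    await asyncio.gather(*(worker(sem, i) for i in range(3)))


asyncio.run(main())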
@@ -101,9 +106,10 @@ async def openai_completions(request: Request, request_data: CompletionRequest):
 
     if request_data.stream:
         async def generator():
-            async with streaming_semaphore:
-                response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy)
-                for resp in response:
+            async with text_generation_semaphore:
+                partial = functools.partial(OAIcompletions.stream_completions, to_dict(request_data), is_legacy=is_legacy)
+
+                async for resp in generate_in_executor(partial):
                     disconnected = await request.is_disconnected()
                     if disconnected:
                         break
@@ -113,7 +119,10 @@ async def openai_completions(request: Request, request_data: CompletionRequest):
         return EventSourceResponse(generator())  # SSE streaming
 
     else:
-        response = OAIcompletions.completions(to_dict(request_data), is_legacy=is_legacy)
+        async with text_generation_semaphore:
+            partial = functools.partial(OAIcompletions.completions, to_dict(request_data), is_legacy=is_legacy)
+            response = await run_in_executor(partial)
+
         return JSONResponse(response)
 
 
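A note on why functools.partial appears throughout: loop.run_in_executor(executor, func, *args) forwards positional arguments only, so keyword arguments such as is_legacy=is_legacy must be bound into the callable beforehand. An illustrative sketch (fake_completions is a stand-in, not the real call):

# functools.partial freezes keyword arguments so the call can be handed to
# run_in_executor, which does not accept **kwargs itself.
import functools


def fake_completions(body: dict, is_legacy: bool = False) -> dict:
    return {"is_legacy": is_legacy, "prompt": body.get("prompt")}


call = functools.partial(fake_completions, {"prompt": "hi"}, is_legacy=True)
print(call())  # {'is_legacy': True, 'prompt': 'hi'}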
@@ -124,9 +133,10 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion
 
     if request_data.stream:
         async def generator():
-            async with streaming_semaphore:
-                response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy)
-                for resp in response:
+            async with text_generation_semaphore:
+                partial = functools.partial(OAIcompletions.stream_chat_completions, to_dict(request_data), is_legacy=is_legacy)
+
+                async for resp in generate_in_executor(partial):
                     disconnected = await request.is_disconnected()
                     if disconnected:
                         break
@@ -136,7 +146,10 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion
         return EventSourceResponse(generator())  # SSE streaming
 
     else:
-        response = OAIcompletions.chat_completions(to_dict(request_data), is_legacy=is_legacy)
+        async with text_generation_semaphore:
+            partial = functools.partial(OAIcompletions.chat_completions, to_dict(request_data), is_legacy=is_legacy)
+            response = await run_in_executor(partial)
+
         return JSONResponse(response)
 
 
@@ -182,7 +195,10 @@ async def handle_audio_transcription(request: Request):
     transcription = {"text": ""}
 
     try:
-        transcription["text"] = r.recognize_whisper(audio_data, language=whisper_language, model=whisper_model)
+        async with stt_semaphore:
+            partial = functools.partial(r.recognize_whisper, audio_data, language=whisper_language, model=whisper_model)
+            transcription["text"] = await run_in_executor(partial)
+
     except sr.UnknownValueError:
         print("Whisper could not understand audio")
         transcription["text"] = "Whisper could not understand audio UnknownValueError"
@@ -205,7 +221,8 @@ async def handle_image_generation(request: Request):
     response_format = body.get('response_format', 'url')  # or b64_json
     n = body.get('n', 1)  # ignore the batch limits of max 10
 
-    response = await OAIimages.generations(prompt=prompt, size=size, response_format=response_format, n=n)
+    partial = functools.partial(OAIimages.generations, prompt=prompt, size=size, response_format=response_format, n=n)
+    response = await run_in_executor(partial)
     return JSONResponse(response)
 
 
@@ -218,7 +235,10 @@ async def handle_embeddings(request: Request, request_data: EmbeddingsRequest):
     if type(input) is str:
         input = [input]
 
-    response = OAIembeddings.embeddings(input, request_data.encoding_format)
+    async with embedding_semaphore:
+        partial = functools.partial(OAIembeddings.embeddings, input, request_data.encoding_format)
+        response = await run_in_executor(partial)
+
     return JSONResponse(response)
 
 
@@ -229,25 +249,37 @@ async def handle_moderations(request: Request):
     if not input:
         raise HTTPException(status_code=400, detail="Missing required argument input")
 
-    response = OAImoderations.moderations(input)
+    async with embedding_semaphore:
+        partial = functools.partial(OAImoderations.moderations, input)
+        response = await run_in_executor(partial)
+
     return JSONResponse(response)
 
 
 @app.post("/v1/internal/encode", response_model=EncodeResponse, dependencies=check_key)
 async def handle_token_encode(request_data: EncodeRequest):
-    response = token_encode(request_data.text)
+    async with small_tasks_semaphore:
+        partial = functools.partial(token_encode, request_data.text)
+        response = await run_in_executor(partial)
+
     return JSONResponse(response)
 
 
 @app.post("/v1/internal/decode", response_model=DecodeResponse, dependencies=check_key)
 async def handle_token_decode(request_data: DecodeRequest):
-    response = token_decode(request_data.tokens)
+    async with small_tasks_semaphore:
+        partial = functools.partial(token_decode, request_data.tokens)
+        response = await run_in_executor(partial)
+
     return JSONResponse(response)
 
 
 @app.post("/v1/internal/token-count", response_model=TokenCountResponse, dependencies=check_key)
 async def handle_token_count(request_data: EncodeRequest):
-    response = token_count(request_data.text)
+    async with small_tasks_semaphore:
+        partial = functools.partial(token_count, request_data.text)
+        response = await run_in_executor(partial)
+
     return JSONResponse(response)
 
 
@@ -257,7 +289,10 @@ async def handle_logits(request_data: LogitsRequest):
     Given a prompt, returns the top 50 most likely logits as a dict.
     The keys are the tokens, and the values are the probabilities.
     '''
-    response = OAIlogits._get_next_logits(to_dict(request_data))
+    async with small_tasks_semaphore:
+        partial = functools.partial(OAIlogits._get_next_logits, to_dict(request_data))
+        response = await run_in_executor(partial)
+
     return JSONResponse(response)
 
 
@@ -265,8 +300,14 @@ async def handle_logits(request_data: LogitsRequest):
 async def handle_chat_prompt(request: Request, request_data: ChatCompletionRequest):
     path = request.url.path
     is_legacy = "/generate" in path
-    generator = OAIcompletions.chat_completions_common(to_dict(request_data), is_legacy=is_legacy, prompt_only=True)
-    response = deque(generator, maxlen=1).pop()
+    async with small_tasks_semaphore:
+        # Run in executor as there are calls to get_encoded_length
+        # which might slow down at really long contexts.
+        partial = functools.partial(OAIcompletions.chat_completions_common, to_dict(request_data), is_legacy=is_legacy, prompt_only=True)
+        generator = await run_in_executor(partial)
+
+        response = deque(generator, maxlen=1).pop()
+
     return JSONResponse(response)
 
 
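A note on the deque idiom this hunk keeps: collections.deque(generator, maxlen=1) drains the generator while retaining only the last yielded item, so .pop() returns the final value (here, the fully rendered prompt) without buffering the rest. For example:

# Keeping only the last value a generator yields.
from collections import deque

last = deque((i * i for i in range(5)), maxlen=1).pop()
print(last)  # 16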
@@ -318,7 +359,10 @@ async def handle_load_model(request_data: LoadModelRequest):
     '''
 
     try:
-        OAImodels._load_model(to_dict(request_data))
+        async with io_semaphore:
+            partial = functools.partial(OAImodels._load_model, to_dict(request_data))
+            await run_in_executor(partial)
+
         return JSONResponse(content="OK")
     except:
         traceback.print_exc()
@@ -327,7 +371,8 @@ async def handle_load_model(request_data: LoadModelRequest):
 
 @app.post("/v1/internal/model/unload", dependencies=check_admin_key)
 async def handle_unload_model():
-    unload_model()
+    async with io_semaphore:
+        await run_in_executor(unload_model)
 
 
 @app.get("/v1/internal/lora/list", response_model=LoraListResponse, dependencies=check_admin_key)
@@ -339,7 +384,10 @@ async def handle_list_loras():
 @app.post("/v1/internal/lora/load", dependencies=check_admin_key)
 async def handle_load_loras(request_data: LoadLorasRequest):
     try:
-        OAImodels.load_loras(request_data.lora_names)
+        async with io_semaphore:
+            partial = functools.partial(OAImodels.load_loras, request_data.lora_names)
+            await run_in_executor(partial)
+
         return JSONResponse(content="OK")
     except:
         traceback.print_exc()
@@ -348,7 +396,9 @@ async def handle_load_loras(request_data: LoadLorasRequest):
 
 @app.post("/v1/internal/lora/unload", dependencies=check_admin_key)
 async def handle_unload_loras():
-    OAImodels.unload_all_loras()
+    async with io_semaphore:
+        await run_in_executor(OAImodels.unload_all_loras)
+
     return JSONResponse(content="OK")
 
 
extensions/openai/utils.py

@@ -2,9 +2,13 @@ import base64
 import os
 import time
 import traceback
-from typing import Callable, Optional
+from typing import Callable, Optional, AsyncGenerator, Generator
 
 import numpy as np
+from modules import shared
+from functools import partial
+import asyncio
+from asyncio import AbstractEventLoop, Future
 
 
 def float_list_to_base64(float_array: np.ndarray) -> str:
@@ -52,3 +56,37 @@ def _start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_star
             time.sleep(3)
 
     raise Exception('Could not start cloudflared.')
+
+
+def get_next_generator_result(gen: Generator) -> tuple[any, bool]:
+    """
+    Because StopIteration interacts badly with generators and cannot be raised into a Future
+    """
+    try:
+        result = next(gen)
+        return result, False
+    except StopIteration:
+        return None, True
+
+
+async def generate_in_executor(partial: partial, loop: AbstractEventLoop = None) -> AsyncGenerator[any, any]:
+    """
+    Converts a blocking generator to an async one
+    """
+    loop = loop or asyncio.get_running_loop()
+    gen = await loop.run_in_executor(None, partial)
+
+    while not shared.stop_everything:
+        result, is_done = await loop.run_in_executor(None, get_next_generator_result, gen)
+        if is_done:
+            break
+
+        yield result
+
+
+async def run_in_executor(partial: partial, loop: AbstractEventLoop = None) -> Future:
+    """
+    Runs a blocking function in a new thread so it can be awaited.
+    """
+    loop = loop or asyncio.get_running_loop()
+    return await loop.run_in_executor(None, partial)
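A standalone sketch of how the two new helpers are intended to be consumed. The names slow_token_stream and next_or_done are illustrative, and the shared.stop_everything check is omitted so the example runs outside the webui:

import asyncio
import functools
import time


def slow_token_stream(n: int):
    # Blocking generator standing in for stream_completions.
    for i in range(n):
        time.sleep(0.2)
        yield f"token-{i}"


def next_or_done(gen):
    # Same idea as get_next_generator_result: signal completion with a flag
    # instead of letting StopIteration escape into the executor Future.
    try:
        return next(gen), False
    except StopIteration:
        return None, True


async def main() -> None:
    loop = asyncio.get_running_loop()
    gen = await loop.run_in_executor(None, functools.partial(slow_token_stream, 3))
    while True:
        item, done = await loop.run_in_executor(None, next_or_done, gen)
        if done:
            break
        print(item)


asyncio.run(main())

Wrapping next() this way matters because asyncio refuses to set StopIteration on a Future, which is what the get_next_generator_result docstring alludes to.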