# text-generation-webui/extensions/api/blocking_api.py

import json
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from threading import Thread

from extensions.api.util import build_parameters, try_start_cloudflared
from modules import shared
from modules.chat import generate_chat_reply
from modules.LoRA import add_lora_to_model
from modules.models import load_model, unload_model
from modules.models_settings import (
    get_model_settings_from_yamls,
    update_model_parameters
)
from modules.text_generation import (
    encode,
    generate_reply,
    stop_everything_event
)
from modules.utils import get_available_models


def get_model_info():
    """Return a snapshot of the current model state.

    The dict carries the model name and LoRA names stored on ``shared``,
    followed by a dump of ``shared.settings`` and the attributes of
    ``shared.args``. ``shared.settings`` is included by reference, not copied.
    """
    info = {}
    info['model_name'] = shared.model_name
    info['lora_names'] = shared.lora_names
    # dump of the global settings and command-line arguments
    info['shared.settings'] = shared.settings
    info['shared.args'] = vars(shared.args)
    return info


class Handler(BaseHTTPRequestHandler):
    """Request handler for the blocking (non-streaming) JSON API.

    Routes:
        GET  /api/v1/model        -> ``{'result': <current model name>}``
        POST /api/v1/generate     -> completion for ``prompt``
        POST /api/v1/chat         -> chat history after handling ``user_input``
        POST /api/v1/stop-stream  -> calls ``stop_everything_event()``
        POST /api/v1/model        -> model actions: info, load, list, unload
        POST /api/v1/token-count  -> number of tokens in ``prompt``

    POST bodies must be JSON objects; an empty body is treated as ``{}``.
    Malformed requests (bad Content-Length, invalid JSON, non-object body,
    missing required field) are answered with HTTP 400 and a JSON body of the
    form ``{'error': {'message': ...}}`` instead of crashing the handler.
    Unknown paths are answered with 404.
    """

    # Maps each POST path to the name of the method that serves it.
    _POST_ROUTES = {
        '/api/v1/generate': '_post_generate',
        '/api/v1/chat': '_post_chat',
        '/api/v1/stop-stream': '_post_stop_stream',
        '/api/v1/model': '_post_model',
        '/api/v1/token-count': '_post_token_count',
    }

    # ------------------------------------------------------------------
    # Response helpers
    # ------------------------------------------------------------------
    def _begin_json(self, status=200):
        """Send the status line and headers of a JSON response."""
        self.send_response(status)
        self.send_header('Content-Type', 'application/json')
        self.end_headers()

    def _write_json(self, payload):
        """Serialize ``payload`` as JSON and write it as the response body."""
        self.wfile.write(json.dumps(payload).encode('utf-8'))

    def _reject(self, message, status=400):
        """Send a complete JSON error response with the given status."""
        self._begin_json(status)
        self._write_json({'error': {'message': message}})

    def _require(self, body, field):
        """Return True if ``field`` is in ``body``; otherwise send a 400."""
        if field in body:
            return True

        self._reject(f"Missing required field '{field}'")
        return False

    def _read_json_body(self):
        """Read and parse the request body.

        Returns the parsed JSON object as a dict, or ``None`` if the request
        was malformed (in which case a 400 response has already been sent).
        """
        raw_length = self.headers.get('Content-Length')
        try:
            content_length = int(raw_length) if raw_length is not None else 0
        except (TypeError, ValueError):
            content_length = -1

        # A negative length would make rfile.read() block until EOF.
        if content_length < 0:
            self._reject('Invalid Content-Length header')
            return None

        raw = self.rfile.read(content_length) if content_length else b''
        if not raw.strip():
            return {}

        try:
            # JSONDecodeError and UnicodeDecodeError both derive from ValueError.
            body = json.loads(raw.decode('utf-8'))
        except ValueError as e:
            self._reject(f'Request body is not valid JSON: {e}')
            return None

        if not isinstance(body, dict):
            self._reject('Request body must be a JSON object')
            return None

        return body

    # ------------------------------------------------------------------
    # HTTP verbs
    # ------------------------------------------------------------------
    def do_GET(self):
        """Serve ``GET /api/v1/model``; any other path is a 404."""
        if self.path == '/api/v1/model':
            self._begin_json()
            self._write_json({
                'result': shared.model_name
            })
        else:
            self.send_error(404)

    def do_POST(self):
        """Parse the JSON body and dispatch to the handler for ``self.path``."""
        body = self._read_json_body()
        if body is None:
            return

        route = self._POST_ROUTES.get(self.path)
        if route is None:
            self.send_error(404)
            return

        getattr(self, route)(body)

    def do_OPTIONS(self):
        """Answer OPTIONS (e.g. CORS preflight) with an empty 200 response."""
        self.send_response(200)
        self.end_headers()

    def end_headers(self):
        """Append the CORS and no-cache headers to every response."""
        self.send_header('Access-Control-Allow-Origin', '*')
        self.send_header('Access-Control-Allow-Methods', '*')
        self.send_header('Access-Control-Allow-Headers', '*')
        self.send_header('Cache-Control', 'no-store, no-cache, must-revalidate')
        super().end_headers()

    # ------------------------------------------------------------------
    # POST endpoints
    # ------------------------------------------------------------------
    def _post_generate(self, body):
        """Run a non-streaming completion and return the last yielded text."""
        if not self._require(body, 'prompt'):
            return

        # Headers go out before generation starts, as in the original flow.
        self._begin_json()

        prompt = body['prompt']
        generate_params = build_parameters(body)
        stopping_strings = generate_params.pop('stopping_strings')
        generate_params['stream'] = False

        generator = generate_reply(
            prompt, generate_params, stopping_strings=stopping_strings, is_chat=False)

        # Only the final value yielded by the generator is returned.
        answer = ''
        for a in generator:
            answer = a

        self._write_json({
            'results': [{
                'text': answer
            }]
        })

    def _post_chat(self, body):
        """Run a non-streaming chat turn and return the resulting history."""
        if not self._require(body, 'user_input'):
            return

        self._begin_json()

        user_input = body['user_input']
        regenerate = body.get('regenerate', False)
        _continue = body.get('_continue', False)

        generate_params = build_parameters(body, chat=True)
        generate_params['stream'] = False

        generator = generate_chat_reply(
            user_input, generate_params, regenerate=regenerate, _continue=_continue, loading_message=False)

        # Fall back to the incoming history if the generator yields nothing.
        answer = generate_params['history']
        for a in generator:
            answer = a

        self._write_json({
            'results': [{
                'history': answer
            }]
        })

    def _post_stop_stream(self, body):
        """Call ``stop_everything_event()`` and acknowledge with 'success'."""
        self._begin_json()

        stop_everything_event()

        self._write_json({
            'results': 'success'
        })

    def _post_model(self, body):
        """Handle the model actions: info, load, list, unload.

        With no (or an unknown) action the current model name is returned,
        the same as the GET interface.
        """
        action = body.get('action', '')

        # Validate the 'load' request before committing to a 200 response.
        if action == 'load':
            if not self._require(body, 'model_name'):
                return
            if not isinstance(body.get('args', {}), dict):
                self._reject("Field 'args' must be a JSON object")
                return

        self._begin_json()

        # by default return the same as the GET interface
        result = shared.model_name

        if action == 'load':
            result = self._load_model(body)

        elif action == 'unload':
            unload_model()
            shared.model_name = None
            shared.args.model = None
            result = get_model_info()

        elif action == 'list':
            result = get_available_models()

        elif action == 'info':
            result = get_model_info()

        self._write_json({
            'result': result,
        })

    def _load_model(self, body):
        """Unload the current model, load ``body['model_name']``, return its info.

        If loading fails, an ``{'error': ...}`` JSON body is written (the 200
        headers have already been sent) and the exception is re-raised.
        """
        model_name = body['model_name']
        args = body.get('args', {})
        print('args', args)

        # SECURITY NOTE(review): arbitrary attributes of shared.args are
        # overwritten with client-supplied values and no validation. This is
        # kept as-is for compatibility; do not expose this endpoint to
        # untrusted clients.
        for k in args:
            setattr(shared.args, k, args[k])

        shared.model_name = model_name
        unload_model()

        model_settings = get_model_settings_from_yamls(shared.model_name)
        shared.settings.update(model_settings)
        update_model_parameters(model_settings, initial=True)

        if shared.settings['mode'] != 'instruct':
            shared.settings['instruction_template'] = None

        try:
            shared.model, shared.tokenizer = load_model(shared.model_name)
            if shared.args.lora:
                add_lora_to_model(shared.args.lora)  # list

        except Exception as e:
            self._write_json({'error': {'message': repr(e)}})
            raise

        shared.args.model = shared.model_name

        return get_model_info()

    def _post_token_count(self, body):
        """Return the number of tokens ``encode()`` produces for ``prompt``."""
        if not self._require(body, 'prompt'):
            return

        self._begin_json()

        tokens = encode(body['prompt'])[0]
        self._write_json({
            'results': [{
                'tokens': len(tokens)
            }]
        })


def _run_server(port: int, share: bool = False):
    """Create the blocking API HTTP server and serve requests.

    The server binds to 0.0.0.0 when ``shared.args.listen`` is set, otherwise
    to 127.0.0.1. This call blocks in ``serve_forever()``.

    Args:
        port: TCP port to listen on.
        share: If true, also try to open a public cloudflared tunnel for the
            port. A tunnel failure is reported but does not stop the local
            server from starting.
    """
    address = '0.0.0.0' if shared.args.listen else '127.0.0.1'

    server = ThreadingHTTPServer((address, port), Handler)

    def on_start(public_url: str):
        print(f'Starting non-streaming server at public url {public_url}/api')

    if share:
        try:
            try_start_cloudflared(port, max_attempts=3, on_start=on_start)
        except Exception as e:
            # Best effort: the tunnel is optional, so report the failure
            # instead of hiding it, and keep serving on the local address.
            print(
                f'Could not start the cloudflared tunnel ({e!r}); '
                f'the API is still served at http://{address}:{port}/api')
    else:
        print(
            f'Starting API at http://{address}:{port}/api')

    server.serve_forever()


def start_server(port: int, share: bool = False):
    """Launch ``_run_server(port, share)`` on a background daemon thread.

    Returns immediately; the thread is a daemon, so it does not keep the
    process alive on its own.
    """
    worker = Thread(target=_run_server, args=[port, share], daemon=True)
    worker.start()
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`import json`
			`from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer`
			`from threading import Thread`

Style improvements (#1957) 2023-05-10 03:49:39 +02:00			`from extensions.api.util import build_parameters, try_start_cloudflared`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`from modules import shared`
Add chat API (#2233) 2023-05-20 23:42:17 +02:00			`from modules.chat import generate_chat_reply`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`from modules.LoRA import add_lora_to_model`
Minor cleanup 2023-06-09 05:30:22 +02:00			`from modules.models import load_model, unload_model`
Add extension example, replace input_hijack with chat_input_modifier (#3307) 2023-07-25 23:49:56 +02:00			`from modules.models_settings import (`
			`get_model_settings_from_yamls,`
			`update_model_parameters`
			`)`
			`from modules.text_generation import (`
			`encode,`
			`generate_reply,`
			`stop_everything_event`
			`)`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`from modules.utils import get_available_models`

Minor cleanup 2023-06-09 05:30:22 +02:00
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`def get_model_info():`
			`return {`
			`'model_name': shared.model_name,`
			`'lora_names': shared.lora_names,`
			`# dump`
			`'shared.settings': shared.settings,`
			`'shared.args': vars(shared.args),`
			`}`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00
Reorganize model loading UI completely (#2720) 2023-06-17 00:00:37 +02:00
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`class Handler(BaseHTTPRequestHandler):`
			`def do_GET(self):`
			`if self.path == '/api/v1/model':`
			`self.send_response(200)`
			`self.end_headers()`
			`response = json.dumps({`
			`'result': shared.model_name`
			`})`

			`self.wfile.write(response.encode('utf-8'))`
			`else:`
			`self.send_error(404)`

			`def do_POST(self):`
			`content_length = int(self.headers['Content-Length'])`
			`body = json.loads(self.rfile.read(content_length).decode('utf-8'))`

			`if self.path == '/api/v1/generate':`
			`self.send_response(200)`
			`self.send_header('Content-Type', 'application/json')`
			`self.end_headers()`

			`prompt = body['prompt']`
			`generate_params = build_parameters(body)`
			`stopping_strings = generate_params.pop('stopping_strings')`
Refactor text_generation.py, add support for custom generation functions (#1817) 2023-05-05 23:53:03 +02:00			`generate_params['stream'] = False`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00
			`generator = generate_reply(`
Refactor chat functions (#2003) 2023-05-11 20:37:04 +02:00			`prompt, generate_params, stopping_strings=stopping_strings, is_chat=False)`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00
			`answer = ''`
			`for a in generator:`
Refactor chat functions (#2003) 2023-05-11 20:37:04 +02:00			`answer = a`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00
			`response = json.dumps({`
			`'results': [{`
Always return only the new tokens in generation functions 2023-05-11 22:07:20 +02:00			`'text': answer`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`}]`
			`})`
Add chat API (#2233) 2023-05-20 23:42:17 +02:00
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`self.wfile.write(response.encode('utf-8'))`
Add chat API (#2233) 2023-05-20 23:42:17 +02:00
			`elif self.path == '/api/v1/chat':`
			`self.send_response(200)`
			`self.send_header('Content-Type', 'application/json')`
			`self.end_headers()`

			`user_input = body['user_input']`
			`regenerate = body.get('regenerate', False)`
			`_continue = body.get('_continue', False)`

			`generate_params = build_parameters(body, chat=True)`
			`generate_params['stream'] = False`

			`generator = generate_chat_reply(`
Update chat API (fixes #3006) 2023-07-05 02:36:47 +02:00			`user_input, generate_params, regenerate=regenerate, _continue=_continue, loading_message=False)`
Add chat API (#2233) 2023-05-20 23:42:17 +02:00
Update chat API (fixes #3006) 2023-07-05 02:36:47 +02:00			`answer = generate_params['history']`
Add chat API (#2233) 2023-05-20 23:42:17 +02:00			`for a in generator:`
			`answer = a`

			`response = json.dumps({`
			`'results': [{`
			`'history': answer`
			`}]`
			`})`

			`self.wfile.write(response.encode('utf-8'))`

Add a `/api/v1/stop-stream` API that allows the user to interrupt the generation (#2392) 2023-05-31 03:03:40 +02:00			`elif self.path == '/api/v1/stop-stream':`
			`self.send_response(200)`
			`self.send_header('Content-Type', 'application/json')`
			`self.end_headers()`

			`stop_everything_event()`

			`response = json.dumps({`
			`'results': 'success'`
			`})`

			`self.wfile.write(response.encode('utf-8'))`

extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`elif self.path == '/api/v1/model':`
			`self.send_response(200)`
			`self.send_header('Content-Type', 'application/json')`
			`self.end_headers()`

			`# by default return the same as the GET interface`
			`result = shared.model_name`

			`# Actions: info, load, list, unload`
			`action = body.get('action', '')`

			`if action == 'load':`
			`model_name = body['model_name']`
			`args = body.get('args', {})`
			`print('args', args)`
			`for k in args:`
			`setattr(shared.args, k, args[k])`

			`shared.model_name = model_name`
			`unload_model()`

Reorganize model loading UI completely (#2720) 2023-06-17 00:00:37 +02:00			`model_settings = get_model_settings_from_yamls(shared.model_name)`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`shared.settings.update(model_settings)`
			`update_model_parameters(model_settings, initial=True)`

			`if shared.settings['mode'] != 'instruct':`
			`shared.settings['instruction_template'] = None`

			`try:`
			`shared.model, shared.tokenizer = load_model(shared.model_name)`
			`if shared.args.lora:`
Reorganize model loading UI completely (#2720) 2023-06-17 00:00:37 +02:00			`add_lora_to_model(shared.args.lora) # list`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00
			`except Exception as e:`
Reorganize model loading UI completely (#2720) 2023-06-17 00:00:37 +02:00			`response = json.dumps({'error': {'message': repr(e)}})`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00
			`self.wfile.write(response.encode('utf-8'))`
			`raise e`

			`shared.args.model = shared.model_name`

			`result = get_model_info()`

			`elif action == 'unload':`
			`unload_model()`
			`shared.model_name = None`
			`shared.args.model = None`
			`result = get_model_info()`

			`elif action == 'list':`
			`result = get_available_models()`

			`elif action == 'info':`
			`result = get_model_info()`

			`response = json.dumps({`
			`'result': result,`
			`})`

			`self.wfile.write(response.encode('utf-8'))`

New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`elif self.path == '/api/v1/token-count':`
			`self.send_response(200)`
			`self.send_header('Content-Type', 'application/json')`
			`self.end_headers()`

			`tokens = encode(body['prompt'])[0]`
			`response = json.dumps({`
			`'results': [{`
			`'tokens': len(tokens)`
			`}]`
			`})`
Add chat API (#2233) 2023-05-20 23:42:17 +02:00
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`self.wfile.write(response.encode('utf-8'))`
			`else:`
			`self.send_error(404)`

Add CORS support to the API (#2718) 2023-06-24 15:16:06 +02:00			`def do_OPTIONS(self):`
			`self.send_response(200)`
			`self.end_headers()`

			`def end_headers(self):`
			`self.send_header('Access-Control-Allow-Origin', '*')`
			`self.send_header('Access-Control-Allow-Methods', '*')`
			`self.send_header('Access-Control-Allow-Headers', '*')`
			`self.send_header('Cache-Control', 'no-store, no-cache, must-revalidate')`
			`super().end_headers()`

New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00
Refactor text_generation.py, add support for custom generation functions (#1817) 2023-05-05 23:53:03 +02:00			`def _run_server(port: int, share: bool = False):`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`address = '0.0.0.0' if shared.args.listen else '127.0.0.1'`

			`server = ThreadingHTTPServer((address, port), Handler)`

			`def on_start(public_url: str):`
			`print(f'Starting non-streaming server at public url {public_url}/api')`

			`if share:`
			`try:`
			`try_start_cloudflared(port, max_attempts=3, on_start=on_start)`
			`except Exception:`
			`pass`
			`else:`
			`print(`
			`f'Starting API at http://{address}:{port}/api')`

			`server.serve_forever()`


			`def start_server(port: int, share: bool = False):`
			`Thread(target=_run_server, args=[port, share], daemon=True).start()`