text-generation-webui/api-examples/api-example-model.py

#!/usr/bin/env python3

import requests

# Address (host:port) of the API server; interpolated into the request URLs
# built by generate() and model_api().
# NOTE(review): 0.0.0.0 is a wildcard bind address; using it as a connect
# target may not work on every platform -- consider 127.0.0.1. TODO confirm.
HOST = '0.0.0.0:5000'


def generate(prompt, tokens=200):
    """Request a completion for ``prompt`` from the generate endpoint.

    Args:
        prompt: Text sent as the ``prompt`` field of the request.
        tokens: Value sent as ``max_new_tokens``.

    Returns:
        The ``text`` of the first entry in the reply's ``results`` list, or
        ``None`` when the server answers with a status other than 200.
    """
    payload = {'prompt': prompt, 'max_new_tokens': tokens}
    reply = requests.post(f'http://{HOST}/api/v1/generate', json=payload)

    # Anything but a plain 200 yields no text at all.
    if reply.status_code != 200:
        return None

    return reply.json()['results'][0]['text']


def model_api(request):
    """POST ``request`` as JSON to the model endpoint and return the decoded JSON reply."""
    endpoint = f'http://{HOST}/api/v1/model'
    return requests.post(endpoint, json=request).json()


# print some common settings
def print_basic_model_info(response):
    """Print the model name, LoRA names and a few shared settings.

    Args:
        response: Mapping with a ``result`` entry holding ``model_name``,
            ``lora_names`` and a ``shared.settings`` mapping.
    """
    info = response['result']
    print("Model: ", info['model_name'])
    print("Lora(s): ", info['lora_names'])
    for key in ('truncation_length', 'instruction_template'):
        print(key, "=", info['shared.settings'][key])


# model info
def model_info():
    """Send an ``info`` action to the model API and print the basic details of the reply."""
    print_basic_model_info(model_api({'action': 'info'}))


# simple loader
def model_load(model_name):
    """Send a ``load`` action for ``model_name`` with no extra args and return the API reply."""
    payload = {'action': 'load', 'model_name': model_name}
    return model_api(payload)


# complex loader
def complex_model_load(model):
    """Load ``model`` through the model API with args guessed from its name.

    The name is lower-cased and searched for substrings such as ``4bit``,
    ``gptq``, ``8bit``, ``ggml`` or ``rwkv`` to choose the loader args. The
    ``model_name`` sent to the API keeps its original casing.

    Args:
        model: Name of the model to load.

    Returns:
        The decoded reply of ``model_api`` for the ``load`` request.
    """

    def groupsize_from_name(lowered_name):
        # First matching tag wins; -1 when the name carries no known tag.
        for tag, size in (('1024g', 1024), ('128g', 128), ('32g', 32)):
            if tag in lowered_name:
                return size
        return -1

    args = {
        'loader': 'AutoGPTQ',

        'bf16': False,
        'load_in_8bit': False,
        'groupsize': 0,
        'wbits': 0,

        # llama.cpp
        'threads': 0,
        'n_batch': 512,
        'no_mmap': False,
        'mlock': False,
        'cache_capacity': None,
        'n_gpu_layers': 0,
        'n_ctx': 2048,

        # RWKV
        'rwkv_strategy': None,
        'rwkv_cuda_on': False,

        # Further keys left disabled in this example:
        # b&b 4-bit
        # 'load_in_4bit': False,
        # 'compute_dtype': 'float16',
        # 'quant_type': 'nf4',
        # 'use_double_quant': False,

        # "cpu": false,
        # "auto_devices": false,
        # "gpu_memory": null,
        # "cpu_memory": null,
        # "disk": false,
        # "disk_cache_dir": "cache",
    }
    req = {'action': 'load', 'model_name': model, 'args': args}

    # All substring checks below are case-insensitive.
    name = model.lower()

    # Quantisation width: 4-bit markers take precedence over the 3-bit one.
    if any(tag in name for tag in ('4bit', 'gptq', 'int4')):
        wbits = 4
    elif '3bit' in name:
        wbits = 3
    else:
        wbits = None

    if wbits is None:
        args['gptq_for_llama'] = False
    else:
        args['wbits'] = wbits
        args['groupsize'] = groupsize_from_name(name)

    # Per-family tweaks; only the first matching family is applied.
    if '8bit' in name:
        args['load_in_8bit'] = True
    elif '-hf' in name or 'fp16' in name:
        if '7b' in name:
            args['bf16'] = True  # for 24GB
        elif '13b' in name:
            args['load_in_8bit'] = True  # for 24GB
    elif 'ggml' in name:
        # args['threads'] = 16
        layers_by_size = (
            (('7b', '13b'), 100),
            (('30b', '33b'), 59),  # 24GB
            (('65b',), 42),  # 24GB
        )
        for size_tags, layers in layers_by_size:
            if any(tag in name for tag in size_tags):
                args['n_gpu_layers'] = layers
                break
    elif 'rwkv' in name:
        args['rwkv_cuda_on'] = True
        # 24GB in both cases
        args['rwkv_strategy'] = 'cuda f16i8' if '14b' in name else 'cuda f16'

    return model_api(req)


if __name__ == '__main__':
    # Smoke-test every model the server lists: load it, print its basic info,
    # then ask it to continue a number sequence and look for "21" in the reply.
    for model in model_api({'action': 'list'})['result']:
        try:
            resp = complex_model_load(model)

            if 'error' in resp:
                print(f"❌ {model} FAIL Error: {resp['error']['message']}")
                continue

            print_basic_model_info(resp)

            ans = generate("0,1,1,2,3,5,8,13,", tokens=2)

            # generate() returns None on a non-200 reply; report that directly
            # instead of letting `'21' in None` surface as a TypeError.
            if ans is None:
                print(f"❌ {model} FAIL (no text returned by generate)")
            elif '21' in ans:
                print(f"✅ {model} PASS ({ans})")
            else:
                print(f"❌ {model} FAIL ({ans})")

        except Exception as e:
            # Deliberate catch-all: one failing model must not abort the run.
            print(f"❌ {model} FAIL Exception: {repr(e)}")


# 0,1,1,2,3,5,8,13, is the start of the Fibonacci sequence; the next number is 21.
# Some sample results are shown below.
""" $ ./api-example-model.py
Model:  4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda
Lora(s):  []
truncation_length = 2048
instruction_template = Alpaca
✅ 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda PASS (21)
Model:  4bit_WizardLM-13B-Uncensored-4bit-128g
Lora(s):  []
truncation_length = 2048
instruction_template = WizardLM
✅ 4bit_WizardLM-13B-Uncensored-4bit-128g PASS (21)
Model:  Aeala_VicUnlocked-alpaca-30b-4bit
Lora(s):  []
truncation_length = 2048
instruction_template = Alpaca
✅ Aeala_VicUnlocked-alpaca-30b-4bit PASS (21)
Model:  alpaca-30b-4bit
Lora(s):  []
truncation_length = 2048
instruction_template = Alpaca
✅ alpaca-30b-4bit PASS (21)
"""
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`#!/usr/bin/env python3`

			`import requests`

			`HOST = '0.0.0.0:5000'`

Add context_instruct to API. Load default model instruction template … (#2688) 2023-07-12 05:01:03 +02:00
			`def generate(prompt, tokens=200):`
			`request = {'prompt': prompt, 'max_new_tokens': tokens}`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`response = requests.post(f'http://{HOST}/api/v1/generate', json=request)`

			`if response.status_code == 200:`
			`return response.json()['results'][0]['text']`


			`def model_api(request):`
			`response = requests.post(f'http://{HOST}/api/v1/model', json=request)`
			`return response.json()`


			`# print some common settings`
			`def print_basic_model_info(response):`
			`basic_settings = ['truncation_length', 'instruction_template']`
			`print("Model: ", response['result']['model_name'])`
			`print("Lora(s): ", response['result']['lora_names'])`
			`for setting in basic_settings:`
Add context_instruct to API. Load default model instruction template … (#2688) 2023-07-12 05:01:03 +02:00			`print(setting, "=", response['result']['shared.settings'][setting])`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00

			`# model info`
			`def model_info():`
			`response = model_api({'action': 'info'})`
			`print_basic_model_info(response)`


			`# simple loader`
			`def model_load(model_name):`
			`return model_api({'action': 'load', 'model_name': model_name})`


			`# complex loader`
			`def complex_model_load(model):`

			`def guess_groupsize(model_name):`
			`if '1024g' in model_name:`
			`return 1024`
			`elif '128g' in model_name:`
			`return 128`
			`elif '32g' in model_name:`
			`return 32`
			`else:`
			`return -1`

			`req = {`
			`'action': 'load',`
			`'model_name': model,`
			`'args': {`
Fix API example for loading models (#3101) 2023-07-11 23:40:55 +02:00			`'loader': 'AutoGPTQ',`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00
			`'bf16': False,`
			`'load_in_8bit': False,`
			`'groupsize': 0,`
			`'wbits': 0,`

			`# llama.cpp`
			`'threads': 0,`
			`'n_batch': 512,`
			`'no_mmap': False,`
			`'mlock': False,`
			`'cache_capacity': None,`
			`'n_gpu_layers': 0,`
			`'n_ctx': 2048,`

			`# RWKV`
			`'rwkv_strategy': None,`
			`'rwkv_cuda_on': False,`

Fix API example for loading models (#3101) 2023-07-11 23:40:55 +02:00			`# b&b 4-bit`
Add context_instruct to API. Load default model instruction template … (#2688) 2023-07-12 05:01:03 +02:00			`# 'load_in_4bit': False,`
			`# 'compute_dtype': 'float16',`
			`# 'quant_type': 'nf4',`
			`# 'use_double_quant': False,`

			`# "cpu": false,`
			`# "auto_devices": false,`
			`# "gpu_memory": null,`
			`# "cpu_memory": null,`
			`# "disk": false,`
			`# "disk_cache_dir": "cache",`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`},`
			`}`

			`model = model.lower()`

			`if '4bit' in model or 'gptq' in model or 'int4' in model:`
			`req['args']['wbits'] = 4`
			`req['args']['groupsize'] = guess_groupsize(model)`
			`elif '3bit' in model:`
			`req['args']['wbits'] = 3`
			`req['args']['groupsize'] = guess_groupsize(model)`
			`else:`
			`req['args']['gptq_for_llama'] = False`

			`if '8bit' in model:`
			`req['args']['load_in_8bit'] = True`
			`elif '-hf' in model or 'fp16' in model:`
			`if '7b' in model:`
Add context_instruct to API. Load default model instruction template … (#2688) 2023-07-12 05:01:03 +02:00			`req['args']['bf16'] = True # for 24GB`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`elif '13b' in model:`
Add context_instruct to API. Load default model instruction template … (#2688) 2023-07-12 05:01:03 +02:00			`req['args']['load_in_8bit'] = True # for 24GB`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`elif 'ggml' in model:`
Add context_instruct to API. Load default model instruction template … (#2688) 2023-07-12 05:01:03 +02:00			`# req['args']['threads'] = 16`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`if '7b' in model:`
			`req['args']['n_gpu_layers'] = 100`
			`elif '13b' in model:`
			`req['args']['n_gpu_layers'] = 100`
			`elif '30b' in model or '33b' in model:`
Add context_instruct to API. Load default model instruction template … (#2688) 2023-07-12 05:01:03 +02:00			`req['args']['n_gpu_layers'] = 59 # 24GB`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`elif '65b' in model:`
Add context_instruct to API. Load default model instruction template … (#2688) 2023-07-12 05:01:03 +02:00			`req['args']['n_gpu_layers'] = 42 # 24GB`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`elif 'rwkv' in model:`
			`req['args']['rwkv_cuda_on'] = True`
			`if '14b' in model:`
Add context_instruct to API. Load default model instruction template … (#2688) 2023-07-12 05:01:03 +02:00			`req['args']['rwkv_strategy'] = 'cuda f16i8' # 24GB`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`else:`
Add context_instruct to API. Load default model instruction template … (#2688) 2023-07-12 05:01:03 +02:00			`req['args']['rwkv_strategy'] = 'cuda f16' # 24GB`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00
			`return model_api(req)`


			`if __name__ == '__main__':`
			`for model in model_api({'action': 'list'})['result']:`
			`try:`
			`resp = complex_model_load(model)`

			`if 'error' in resp:`
Add context_instruct to API. Load default model instruction template … (#2688) 2023-07-12 05:01:03 +02:00			`print(f"❌ {model} FAIL Error: {resp['error']['message']}")`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`continue`
			`else:`
			`print_basic_model_info(resp)`

			`ans = generate("0,1,1,2,3,5,8,13,", tokens=2)`

			`if '21' in ans:`
Add context_instruct to API. Load default model instruction template … (#2688) 2023-07-12 05:01:03 +02:00			`print(f"✅ {model} PASS ({ans})")`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`else:`
Add context_instruct to API. Load default model instruction template … (#2688) 2023-07-12 05:01:03 +02:00			`print(f"❌ {model} FAIL ({ans})")`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00
			`except Exception as e:`
Add context_instruct to API. Load default model instruction template … (#2688) 2023-07-12 05:01:03 +02:00			`print(f"❌ {model} FAIL Exception: {repr(e)}")`
Fix API example for loading models (#3101) 2023-07-11 23:40:55 +02:00
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00
			`# 0,1,1,2,3,5,8,13, is the fibonacci sequence, the next number is 21.`
			`# Some results below.`
Fix API example for loading models (#3101) 2023-07-11 23:40:55 +02:00			`""" $ ./model-api-example.py`
extensions/api: models api for blocking_api (updated) (#2539) 2023-06-08 16:34:36 +02:00			`Model: 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda`
			`Lora(s): []`
			`truncation_length = 2048`
			`instruction_template = Alpaca`
			`✅ 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda PASS (21)`
			`Model: 4bit_WizardLM-13B-Uncensored-4bit-128g`
			`Lora(s): []`
			`truncation_length = 2048`
			`instruction_template = WizardLM`
			`✅ 4bit_WizardLM-13B-Uncensored-4bit-128g PASS (21)`
			`Model: Aeala_VicUnlocked-alpaca-30b-4bit`
			`Lora(s): []`
			`truncation_length = 2048`
			`instruction_template = Alpaca`
			`✅ Aeala_VicUnlocked-alpaca-30b-4bit PASS (21)`
			`Model: alpaca-30b-4bit`
			`Lora(s): []`
			`truncation_length = 2048`
			`instruction_template = Alpaca`
			`✅ alpaca-30b-4bit PASS (21)`
			`"""`