extensions/api: models api for blocking_api (updated) (#2539)

2024-11-25 09:19:23 +01:00 · 2023-06-08 10:34:36 -04:00 · 2023-06-08 10:34:36 -04:00 · 7be6fe126b
commit 7be6fe126b
parent 084b006cfe
4 changed files with 256 additions and 2 deletions
--- a/api-examples/api-example-chat.py
+++ b/api-examples/api-example-chat.py
@ -7,7 +7,7 @@ HOST = 'localhost:5000'
 URI = f'http://{HOST}/api/v1/chat'
 # For reverse-proxied streaming, the remote will likely host with ssl - https://
-# URI = 'https://your-uri-here.trycloudflare.com/api/v1/generate'
+# URI = 'https://your-uri-here.trycloudflare.com/api/v1/chat'
 def run(user_input, history):
--- a/api-examples/api-example-model.py
+++ b/api-examples/api-example-model.py
@ -0,0 +1,176 @@
 #!/usr/bin/env python3
 import requests
 # Address of the API server the example talks to.  'localhost' is used rather
 # than '0.0.0.0': the latter is a bind-all placeholder, not a portable
 # destination address (connecting to it fails on some platforms, e.g. Windows).
 HOST = 'localhost:5000'
 def generate(prompt, tokens = 200):
    """Request a completion for ``prompt`` from the blocking generate endpoint.

    Posts ``prompt`` and ``max_new_tokens`` (taken from ``tokens``) as JSON to
    ``/api/v1/generate`` on ``HOST`` and returns the text of the first result.
    Any HTTP status other than 200 yields ``None``.
    """
    payload = {'prompt': prompt, 'max_new_tokens': tokens}
    reply = requests.post(f'http://{HOST}/api/v1/generate', json=payload)
    if reply.status_code != 200:
        return None
    return reply.json()['results'][0]['text']
 def model_api(request):
    """POST ``request`` as JSON to ``/api/v1/model`` on ``HOST`` and return the decoded JSON reply."""
    endpoint = f'http://{HOST}/api/v1/model'
    return requests.post(endpoint, json=request).json()
 # print some common settings
 def print_basic_model_info(response):
    """Print the model name, LoRA names and a couple of settings from a model API reply.

    ``response`` is read as ``response['result']`` with the keys
    ``model_name``, ``lora_names`` and ``shared.settings``; from the settings
    only ``truncation_length`` and ``instruction_template`` are shown.
    """
    info = response['result']
    print("Model: ", info['model_name'])
    print("Lora(s): ", info['lora_names'])
    settings = info['shared.settings']
    for key in ('truncation_length', 'instruction_template'):
        print(key, "=",  settings[key])
 # model info
 def model_info():
    """Query the server with the ``info`` action and print the basic settings from its reply."""
    print_basic_model_info(model_api({'action': 'info'}))
 # simple loader
 def model_load(model_name):
    """Ask the server to load ``model_name`` without extra loader arguments; return the decoded reply."""
    load_request = {'action': 'load', 'model_name': model_name}
    return model_api(load_request)
 # complex loader
 def complex_model_load(model):
    """Load ``model`` with loader arguments guessed from its name.

    A ``load`` request is built with default ``args``; the defaults are then
    adjusted by looking for substrings in the lower-cased model name
    (quantisation bits, group size, parameter count, ``ggml`` / ``rwkv``
    markers).  The request keeps the model name exactly as given and is sent
    through ``model_api``, whose reply is returned unchanged.

    The size-dependent values are annotated "24GB" below, presumably meaning
    they were picked for a 24 GB GPU.
    """
    def groupsize_from(name):
        # First marker present in the name wins; -1 when there is none.
        for marker, size in (('1024g', 1024), ('128g', 128), ('32g', 32)):
            if marker in name:
                return size
        return -1

    args = {
        'gptq_for_llama': False, # Use AutoGPTQ by default, set to True for gptq-for-llama
        'bf16': False,
        'load_in_8bit': False,
        'groupsize': 0,
        'wbits': 0,
        # llama.cpp
        'threads': 0,
        'n_batch': 512,
        'no_mmap': False,
        'mlock': False,
        'cache_capacity': None,
        'n_gpu_layers': 0,
        'n_ctx': 2048,
        # RWKV
        'rwkv_strategy': None,
        'rwkv_cuda_on': False,
        # Further options left unset here (b&b 4-bit and device placement):
        #'load_in_4bit': False,
        #'compute_dtype': 'float16',
        #'quant_type': 'nf4',
        #'use_double_quant': False,
        #"cpu": false,
        #"auto_devices": false,
        #"gpu_memory": null,
        #"cpu_memory": null,
        #"disk": false,
        #"disk_cache_dir": "cache",
    }
    req = {'action': 'load', 'model_name': model, 'args': args}

    # All name matching is case-insensitive; the request above already holds
    # the original spelling.
    lowered = model.lower()

    def has(*tags):
        return any(tag in lowered for tag in tags)

    # Quantisation width and group size.
    if has('4bit', 'gptq', 'int4'):
        wbits = 4
    elif has('3bit'):
        wbits = 3
    else:
        wbits = None
    if wbits is None:
        args['gptq_for_llama'] = False
    else:
        args['wbits'] = wbits
        args['groupsize'] = groupsize_from(lowered)

    # Loader-specific tweaks; only the first matching family applies.
    if has('8bit'):
        args['load_in_8bit'] = True
    elif has('-hf', 'fp16'):
        if has('7b'):
            args['bf16'] = True # for 24GB
        elif has('13b'):
            args['load_in_8bit'] = True # for 24GB
    elif has('ggml'):
        # args['threads'] = 16 could be set here as well.
        gpu_layers_by_size = (
            (('7b',), 100),
            (('13b',), 100),
            (('30b', '33b'), 59), # 24GB
            (('65b',), 42), # 24GB
        )
        for size_tags, layers in gpu_layers_by_size:
            if has(*size_tags):
                args['n_gpu_layers'] = layers
                break
    elif has('rwkv'):
        args['rwkv_cuda_on'] = True
        args['rwkv_strategy'] = 'cuda f16i8' if has('14b') else 'cuda f16' # 24GB

    return model_api(req)
 if __name__ == '__main__':
    # Try to load every model the server lists and ask each one to continue a
    # short number sequence; a reply containing '21' counts as a pass.
    available = model_api({'action': 'list'})['result']
    for model in available:
        try:
            resp = complex_model_load(model)
            if 'error' in resp:
                print (f"❌ {model} FAIL Error: {resp['error']['message']}")
                continue
            print_basic_model_info(resp)
            ans = generate("0,1,1,2,3,5,8,13,", tokens=2)
            verdict = f"✅ {model} PASS ({ans})" if '21' in ans else f"❌ {model} FAIL ({ans})"
            print (verdict)
        except Exception as e:
            # Any failure (load, request, unusable reply) is reported per model
            # so the loop can move on to the next one.
            print (f"❌ {model} FAIL Exception: {repr(e)}")
 # 0,1,1,2,3,5,8,13, is the start of the Fibonacci sequence; the next number is 21.
 # Some sample results are shown below.
 """ $ ./api-example-model.py
 Model:  4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda
 Lora(s):  []
 truncation_length = 2048
 instruction_template = Alpaca
 ✅ 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda PASS (21)
 Model:  4bit_WizardLM-13B-Uncensored-4bit-128g
 Lora(s):  []
 truncation_length = 2048
 instruction_template = WizardLM
 ✅ 4bit_WizardLM-13B-Uncensored-4bit-128g PASS (21)
 Model:  Aeala_VicUnlocked-alpaca-30b-4bit
 Lora(s):  []
 truncation_length = 2048
 instruction_template = Alpaca
 ✅ Aeala_VicUnlocked-alpaca-30b-4bit PASS (21)
 Model:  alpaca-30b-4bit
 Lora(s):  []
 truncation_length = 2048
 instruction_template = Alpaca
 ✅ alpaca-30b-4bit PASS (21)
 """
--- a/extensions/api/blocking_api.py
+++ b/extensions/api/blocking_api.py
@ -6,7 +6,19 @@ from extensions.api.util import build_parameters, try_start_cloudflared
 from modules import shared
 from modules.chat import generate_chat_reply
 from modules.text_generation import encode, generate_reply, stop_everything_event
 from modules.models import load_model, unload_model
 from modules.LoRA import add_lora_to_model
 from modules.utils import get_available_models
 from server import get_model_specific_settings, update_model_parameters
 def get_model_info():
    """Return a dict describing the current model state.

    The dict holds the current model name, the LoRA names, and a dump of
    ``shared.settings`` and ``shared.args``.  The args entry comes from
    ``vars()``, so it is the namespace's own attribute dict, not a copy.
    """
    info = {}
    info['model_name'] = shared.model_name
    info['lora_names'] = shared.lora_names
    # dump
    info['shared.settings'] = shared.settings
    info['shared.args'] = vars(shared.args)
    return info
 class Handler(BaseHTTPRequestHandler):
    def do_GET(self):
@ -91,6 +103,67 @@ class Handler(BaseHTTPRequestHandler):
            self.wfile.write(response.encode('utf-8'))
        elif self.path == '/api/v1/model':
            self.send_response(200)
            self.send_header('Content-Type', 'application/json')
            self.end_headers()
            # by default return the same as the GET interface
            result = shared.model_name
            # Actions: info, load, list, unload
            action = body.get('action', '')
            if action == 'load':
                model_name = body['model_name']
                args = body.get('args', {})
                print('args', args)
                for k in args:
                    setattr(shared.args, k, args[k])
                shared.model_name = model_name
                unload_model()
                model_settings = get_model_specific_settings(shared.model_name)
                shared.settings.update(model_settings)
                update_model_parameters(model_settings, initial=True)
                if shared.settings['mode'] != 'instruct':
                    shared.settings['instruction_template'] = None
                try:
                    shared.model, shared.tokenizer = load_model(shared.model_name)
                    if shared.args.lora:
                        add_lora_to_model(shared.args.lora) # list
                except Exception as e:
                    response = json.dumps({'error': { 'message': repr(e) } })
                    self.wfile.write(response.encode('utf-8'))
                    raise e
                shared.args.model = shared.model_name
                result = get_model_info()
            elif action == 'unload':
                unload_model()
                shared.model_name = None
                shared.args.model = None
                result = get_model_info()
            elif action == 'list':
                result = get_available_models()
            elif action == 'info':
                result = get_model_info()
            response = json.dumps({
                'result': result,
            })
            self.wfile.write(response.encode('utf-8'))
        elif self.path == '/api/v1/token-count':
            self.send_response(200)
            self.send_header('Content-Type', 'application/json')
--- a/extensions/multimodal/pipelines/llava/llava.py
+++ b/extensions/multimodal/pipelines/llava/llava.py
@ -56,7 +56,12 @@ class LLaVA_v0_Pipeline(AbstractMultimodalPipeline):
    @staticmethod
    def embed_tokens(input_ids: torch.Tensor) -> torch.Tensor:
-        return shared.model.model.embed_tokens(input_ids).to(shared.model.device, dtype=shared.model.dtype)
+        if hasattr(shared.model.model, 'embed_tokens'):
            func = shared.model.model.embed_tokens
        else:
            func = shared.model.model.model.embed_tokens  # AutoGPTQ case
        return func(input_ids).to(shared.model.device, dtype=shared.model.dtype)
    @staticmethod
    def placeholder_embeddings() -> torch.Tensor: