From 7be6fe126b9feec78e77c22a8364fa049587b776 Mon Sep 17 00:00:00 2001
From: matatonic <73265741+matatonic@users.noreply.github.com>
Date: Thu, 8 Jun 2023 10:34:36 -0400
Subject: [PATCH] extensions/api: models api for blocking_api (updated) (#2539)

---
 api-examples/api-example-chat.py          |   2 +-
 api-examples/api-example-model.py         | 176 ++++++++++++++++++
 extensions/api/blocking_api.py            |  73 ++++++++
 .../multimodal/pipelines/llava/llava.py   |   7 +-
 4 files changed, 256 insertions(+), 2 deletions(-)
 create mode 100755 api-examples/api-example-model.py

diff --git a/api-examples/api-example-chat.py b/api-examples/api-example-chat.py
index 905fbca6..8ea6ed1e 100644
--- a/api-examples/api-example-chat.py
+++ b/api-examples/api-example-chat.py
@@ -7,7 +7,7 @@ HOST = 'localhost:5000'
 URI = f'http://{HOST}/api/v1/chat'
 
 # For reverse-proxied streaming, the remote will likely host with ssl - https://
-# URI = 'https://your-uri-here.trycloudflare.com/api/v1/generate'
+# URI = 'https://your-uri-here.trycloudflare.com/api/v1/chat'
 
 
 def run(user_input, history):
diff --git a/api-examples/api-example-model.py b/api-examples/api-example-model.py
new file mode 100755
index 00000000..8e1e3002
--- /dev/null
+++ b/api-examples/api-example-model.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+
+import requests
+
+HOST = '0.0.0.0:5000'
+
+def generate(prompt, tokens = 200):
+    request = { 'prompt': prompt, 'max_new_tokens': tokens }
+    response = requests.post(f'http://{HOST}/api/v1/generate', json=request)
+
+    if response.status_code == 200:
+        return response.json()['results'][0]['text']
+
+
+def model_api(request):
+    response = requests.post(f'http://{HOST}/api/v1/model', json=request)
+    return response.json()
+
+
+# print some common settings
+def print_basic_model_info(response):
+    basic_settings = ['truncation_length', 'instruction_template']
+    print("Model: ", response['result']['model_name'])
+    print("Lora(s): ", response['result']['lora_names'])
+    for setting in basic_settings:
+        print(setting, "=", response['result']['shared.settings'][setting])
+
+
+# model info
+def model_info():
+    response = model_api({'action': 'info'})
+    print_basic_model_info(response)
+
+
+# simple loader
+def model_load(model_name):
+    return model_api({'action': 'load', 'model_name': model_name})
+
+
+# complex loader
+def complex_model_load(model):
+
+    def guess_groupsize(model_name):
+        if '1024g' in model_name:
+            return 1024
+        elif '128g' in model_name:
+            return 128
+        elif '32g' in model_name:
+            return 32
+        else:
+            return -1
+
+    req = {
+        'action': 'load',
+        'model_name': model,
+        'args': {
+            'gptq_for_llama': False, # Use AutoGPTQ by default, set to True for gptq-for-llama
+
+            'bf16': False,
+            'load_in_8bit': False,
+            'groupsize': 0,
+            'wbits': 0,
+
+            # llama.cpp
+            'threads': 0,
+            'n_batch': 512,
+            'no_mmap': False,
+            'mlock': False,
+            'cache_capacity': None,
+            'n_gpu_layers': 0,
+            'n_ctx': 2048,
+
+            # RWKV
+            'rwkv_strategy': None,
+            'rwkv_cuda_on': False,
+
+            # b&b 4-bit
+            #'load_in_4bit': False,
+            #'compute_dtype': 'float16',
+            #'quant_type': 'nf4',
+            #'use_double_quant': False,
+
+            #"cpu": false,
+            #"auto_devices": false,
+            #"gpu_memory": null,
+            #"cpu_memory": null,
+            #"disk": false,
+            #"disk_cache_dir": "cache",
+        },
+    }
+
+    model = model.lower()
+
+    if '4bit' in model or 'gptq' in model or 'int4' in model:
+        req['args']['wbits'] = 4
+        req['args']['groupsize'] = guess_groupsize(model)
+    elif '3bit' in model:
+        req['args']['wbits'] = 3
+        req['args']['groupsize'] = guess_groupsize(model)
+    else:
+        req['args']['gptq_for_llama'] = False
+
+    if '8bit' in model:
+        req['args']['load_in_8bit'] = True
+    elif '-hf' in model or 'fp16' in model:
+        if '7b' in model:
+            req['args']['bf16'] = True # for 24GB
+        elif '13b' in model:
+            req['args']['load_in_8bit'] = True # for 24GB
+    elif 'ggml' in model:
+        #req['args']['threads'] = 16
+        if '7b' in model:
+            req['args']['n_gpu_layers'] = 100
+        elif '13b' in model:
+            req['args']['n_gpu_layers'] = 100
+        elif '30b' in model or '33b' in model:
+            req['args']['n_gpu_layers'] = 59 # 24GB
+        elif '65b' in model:
+            req['args']['n_gpu_layers'] = 42 # 24GB
+    elif 'rwkv' in model:
+        req['args']['rwkv_cuda_on'] = True
+        if '14b' in model:
+            req['args']['rwkv_strategy'] = 'cuda f16i8' # 24GB
+        else:
+            req['args']['rwkv_strategy'] = 'cuda f16' # 24GB
+
+
+    return model_api(req)
+
+
+if __name__ == '__main__':
+    for model in model_api({'action': 'list'})['result']:
+        try:
+            resp = complex_model_load(model)
+
+            if 'error' in resp:
+                print (f"❌ {model} FAIL Error: {resp['error']['message']}")
+                continue
+            else:
+                print_basic_model_info(resp)
+
+            ans = generate("0,1,1,2,3,5,8,13,", tokens=2)
+
+            if '21' in ans:
+                print (f"✅ {model} PASS ({ans})")
+            else:
+                print (f"❌ {model} FAIL ({ans})")
+
+        except Exception as e:
+            print (f"❌ {model} FAIL Exception: {repr(e)}")
+
+
+# 0,1,1,2,3,5,8,13, is the fibonacci sequence, the next number is 21.
+# Some results below.
+""" $ ./model-api-example.py
+Model: 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda
+Lora(s): []
+truncation_length = 2048
+instruction_template = Alpaca
+✅ 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda PASS (21)
+Model: 4bit_WizardLM-13B-Uncensored-4bit-128g
+Lora(s): []
+truncation_length = 2048
+instruction_template = WizardLM
+✅ 4bit_WizardLM-13B-Uncensored-4bit-128g PASS (21)
+Model: Aeala_VicUnlocked-alpaca-30b-4bit
+Lora(s): []
+truncation_length = 2048
+instruction_template = Alpaca
+✅ Aeala_VicUnlocked-alpaca-30b-4bit PASS (21)
+Model: alpaca-30b-4bit
+Lora(s): []
+truncation_length = 2048
+instruction_template = Alpaca
+✅ alpaca-30b-4bit PASS (21)
+"""
diff --git a/extensions/api/blocking_api.py b/extensions/api/blocking_api.py
index 6bcd840c..c787cd0f 100644
--- a/extensions/api/blocking_api.py
+++ b/extensions/api/blocking_api.py
@@ -6,7 +6,19 @@ from extensions.api.util import build_parameters, try_start_cloudflared
 from modules import shared
 from modules.chat import generate_chat_reply
 from modules.text_generation import encode, generate_reply, stop_everything_event
+from modules.models import load_model, unload_model
+from modules.LoRA import add_lora_to_model
+from modules.utils import get_available_models
+from server import get_model_specific_settings, update_model_parameters
+def get_model_info():
+    return {
+        'model_name': shared.model_name,
+        'lora_names': shared.lora_names,
+        # dump
+        'shared.settings': shared.settings,
+        'shared.args': vars(shared.args),
+    }
 
 
 class Handler(BaseHTTPRequestHandler):
     def do_GET(self):
@@ -91,6 +103,67 @@ class Handler(BaseHTTPRequestHandler):
 
             self.wfile.write(response.encode('utf-8'))
 
+        elif self.path == '/api/v1/model':
+            self.send_response(200)
+            self.send_header('Content-Type', 'application/json')
+            self.end_headers()
+
+            # by default return the same as the GET interface
+            result = shared.model_name
+
+            # Actions: info, load, list, unload
+            action = body.get('action', '')
+
+            if action == 'load':
+                model_name = body['model_name']
+                args = body.get('args', {})
+                print('args', args)
+                for k in args:
+                    setattr(shared.args, k, args[k])
+
+                shared.model_name = model_name
+                unload_model()
+
+                model_settings = get_model_specific_settings(shared.model_name)
+                shared.settings.update(model_settings)
+                update_model_parameters(model_settings, initial=True)
+
+                if shared.settings['mode'] != 'instruct':
+                    shared.settings['instruction_template'] = None
+
+                try:
+                    shared.model, shared.tokenizer = load_model(shared.model_name)
+                    if shared.args.lora:
+                        add_lora_to_model(shared.args.lora) # list
+
+                except Exception as e:
+                    response = json.dumps({'error': { 'message': repr(e) } })
+
+                    self.wfile.write(response.encode('utf-8'))
+                    raise e
+
+                shared.args.model = shared.model_name
+
+                result = get_model_info()
+
+            elif action == 'unload':
+                unload_model()
+                shared.model_name = None
+                shared.args.model = None
+                result = get_model_info()
+
+            elif action == 'list':
+                result = get_available_models()
+
+            elif action == 'info':
+                result = get_model_info()
+
+            response = json.dumps({
+                'result': result,
+            })
+
+            self.wfile.write(response.encode('utf-8'))
+
         elif self.path == '/api/v1/token-count':
             self.send_response(200)
             self.send_header('Content-Type', 'application/json')
diff --git a/extensions/multimodal/pipelines/llava/llava.py b/extensions/multimodal/pipelines/llava/llava.py
index 16f0e06f..eca2be50 100644
--- a/extensions/multimodal/pipelines/llava/llava.py
+++ b/extensions/multimodal/pipelines/llava/llava.py
@@ -56,7 +56,12 @@ class LLaVA_v0_Pipeline(AbstractMultimodalPipeline):
 
     @staticmethod
     def embed_tokens(input_ids: torch.Tensor) -> torch.Tensor:
-        return shared.model.model.embed_tokens(input_ids).to(shared.model.device, dtype=shared.model.dtype)
+        if hasattr(shared.model.model, 'embed_tokens'):
+            func = shared.model.model.embed_tokens
+        else:
+            func = shared.model.model.model.embed_tokens # AutoGPTQ case
+
+        return func(input_ids).to(shared.model.device, dtype=shared.model.dtype)
 
     @staticmethod
     def placeholder_embeddings() -> torch.Tensor:
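
For reference, a minimal sketch of driving the new /api/v1/model endpoint from a client. The HOST value below is a placeholder and the snippet assumes the blocking API extension above is already running; api-example-model.py in the patch exercises the full loader logic, this only walks through the bare 'list', 'load', 'info' and 'unload' request/response shapes.

    import requests

    HOST = 'localhost:5000'  # placeholder; use the host:port the blocking API listens on

    def model_api(request):
        # Every action goes through the same POST endpoint added by this patch.
        return requests.post(f'http://{HOST}/api/v1/model', json=request).json()

    # 'list' returns the available model names under 'result'.
    models = model_api({'action': 'list'})['result']
    print(models)

    # 'load' switches the server to one of them; on success the reply mirrors get_model_info().
    resp = model_api({'action': 'load', 'model_name': models[0]})
    if 'error' in resp:
        print('load failed:', resp['error']['message'])
    else:
        print('loaded:', resp['result']['model_name'], 'loras:', resp['result']['lora_names'])

    # 'info' reports the currently loaded model; 'unload' frees it again.
    print(model_api({'action': 'info'})['result']['model_name'])
    model_api({'action': 'unload'})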