mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-25 01:09:22 +01:00
extensions/api: models api for blocking_api (updated) (#2539)
This commit is contained in:
parent
084b006cfe
commit
7be6fe126b
@ -7,7 +7,7 @@ HOST = 'localhost:5000'
|
||||
URI = f'http://{HOST}/api/v1/chat'
|
||||
|
||||
# For reverse-proxied streaming, the remote will likely host with ssl - https://
|
||||
# URI = 'https://your-uri-here.trycloudflare.com/api/v1/generate'
|
||||
# URI = 'https://your-uri-here.trycloudflare.com/api/v1/chat'
|
||||
|
||||
|
||||
def run(user_input, history):
|
||||
|
176
api-examples/api-example-model.py
Executable file
176
api-examples/api-example-model.py
Executable file
@ -0,0 +1,176 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import requests
|
||||
|
||||
# Address (host:port) of the running text-generation-webui API server.
# Use localhost rather than 0.0.0.0: 0.0.0.0 is a listen-side wildcard and is
# not a portable address to connect to (it fails on some platforms).
HOST = 'localhost:5000'
|
||||
|
||||
def generate(prompt, tokens=200):
    """Ask the blocking generate endpoint to complete ``prompt``.

    Posts ``prompt`` and ``max_new_tokens`` (taken from ``tokens``) to
    ``/api/v1/generate`` on ``HOST``.

    Returns the ``text`` of the first entry in ``results`` when the server
    answers with HTTP 200; for any other status code the result is ``None``.
    """
    payload = {'prompt': prompt, 'max_new_tokens': tokens}
    reply = requests.post(f'http://{HOST}/api/v1/generate', json=payload)

    # Anything other than 200 yields None; callers must be ready for that.
    if reply.status_code != 200:
        return None

    return reply.json()['results'][0]['text']
|
||||
|
||||
|
||||
def model_api(request):
    """POST ``request`` as JSON to ``/api/v1/model`` and return the decoded JSON reply."""
    endpoint = f'http://{HOST}/api/v1/model'
    return requests.post(endpoint, json=request).json()
|
||||
|
||||
|
||||
# print some common settings
|
||||
def print_basic_model_info(response):
    """Print the model name, LoRA names and a few common settings.

    ``response`` is a model-API reply whose ``result`` entry holds
    ``model_name``, ``lora_names`` and a ``shared.settings`` mapping.
    """
    info = response['result']
    print("Model: ", info['model_name'])
    print("Lora(s): ", info['lora_names'])

    # Looked up only after the two lines above have been printed.
    settings = info['shared.settings']
    for key in ('truncation_length', 'instruction_template'):
        print(key, "=", settings[key])
|
||||
|
||||
|
||||
# model info
|
||||
def model_info():
    """Query the model API with the ``info`` action and print the basic details."""
    print_basic_model_info(model_api({'action': 'info'}))
|
||||
|
||||
|
||||
# simple loader
|
||||
def model_load(model_name):
    """Ask the server to load ``model_name`` with no extra arguments; return its reply."""
    load_request = {'action': 'load', 'model_name': model_name}
    return model_api(load_request)
|
||||
|
||||
|
||||
# complex loader
|
||||
def complex_model_load(model):
    """Load ``model`` through the API with loader arguments guessed from its name.

    A default argument set is built first, then adjusted by looking for
    markers (``4bit``, ``gptq``, ``ggml``, ``rwkv``, size tags such as ``13b``,
    ...) in the lowercased model name. The request itself carries the name
    exactly as it was passed in.

    Returns whatever ``model_api`` returns for the ``load`` action.
    """

    def guess_groupsize(model_name):
        # First matching marker wins; -1 means no groupsize marker was found.
        for marker, size in (('1024g', 1024), ('128g', 128), ('32g', 32)):
            if marker in model_name:
                return size
        return -1

    args = {
        'gptq_for_llama': False,  # Use AutoGPTQ by default, set to True for gptq-for-llama

        'bf16': False,
        'load_in_8bit': False,
        'groupsize': 0,
        'wbits': 0,

        # llama.cpp
        'threads': 0,
        'n_batch': 512,
        'no_mmap': False,
        'mlock': False,
        'cache_capacity': None,
        'n_gpu_layers': 0,
        'n_ctx': 2048,

        # RWKV
        'rwkv_strategy': None,
        'rwkv_cuda_on': False,

        # b&b 4-bit
        #'load_in_4bit': False,
        #'compute_dtype': 'float16',
        #'quant_type': 'nf4',
        #'use_double_quant': False,

        #"cpu": false,
        #"auto_devices": false,
        #"gpu_memory": null,
        #"cpu_memory": null,
        #"disk": false,
        #"disk_cache_dir": "cache",
    }
    req = {
        'action': 'load',
        'model_name': model,
        'args': args,
    }

    # All marker matching below is case-insensitive.
    name = model.lower()

    # Quantization width and groupsize.
    if any(tag in name for tag in ('4bit', 'gptq', 'int4')):
        args['wbits'] = 4
        args['groupsize'] = guess_groupsize(name)
    elif '3bit' in name:
        args['wbits'] = 3
        args['groupsize'] = guess_groupsize(name)
    else:
        args['gptq_for_llama'] = False

    # Backend-specific tuning; the "24GB" notes are from the original author
    # (presumably values tuned for a 24GB GPU -- confirm before relying on them).
    if '8bit' in name:
        args['load_in_8bit'] = True
    elif '-hf' in name or 'fp16' in name:
        if '7b' in name:
            args['bf16'] = True  # for 24GB
        elif '13b' in name:
            args['load_in_8bit'] = True  # for 24GB
    elif 'ggml' in name:
        #args['threads'] = 16
        if '7b' in name or '13b' in name:
            args['n_gpu_layers'] = 100
        elif '30b' in name or '33b' in name:
            args['n_gpu_layers'] = 59  # 24GB
        elif '65b' in name:
            args['n_gpu_layers'] = 42  # 24GB
    elif 'rwkv' in name:
        args['rwkv_cuda_on'] = True
        args['rwkv_strategy'] = 'cuda f16i8' if '14b' in name else 'cuda f16'  # 24GB

    return model_api(req)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Smoke test: load every model the server lists, print its basic info,
    # then check that it continues a short number sequence with "21".
    available_models = model_api({'action': 'list'})['result']

    for model in available_models:
        try:
            resp = complex_model_load(model)

            # A failed load is reported by the server under the 'error' key.
            if 'error' in resp:
                print(f"❌ {model} FAIL Error: {resp['error']['message']}")
                continue

            print_basic_model_info(resp)

            ans = generate("0,1,1,2,3,5,8,13,", tokens=2)

            if '21' not in ans:
                print(f"❌ {model} FAIL ({ans})")
            else:
                print(f"✅ {model} PASS ({ans})")

        except Exception as e:
            # Keep going so one broken model does not stop the whole run.
            print(f"❌ {model} FAIL Exception: {e!r}")
|
||||
|
||||
|
||||
# 0,1,1,2,3,5,8,13, is the start of the Fibonacci sequence; the next number is 21.
|
||||
# Some results below.
|
||||
""" $ ./model-api-example.py
|
||||
Model: 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda
|
||||
Lora(s): []
|
||||
truncation_length = 2048
|
||||
instruction_template = Alpaca
|
||||
✅ 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda PASS (21)
|
||||
Model: 4bit_WizardLM-13B-Uncensored-4bit-128g
|
||||
Lora(s): []
|
||||
truncation_length = 2048
|
||||
instruction_template = WizardLM
|
||||
✅ 4bit_WizardLM-13B-Uncensored-4bit-128g PASS (21)
|
||||
Model: Aeala_VicUnlocked-alpaca-30b-4bit
|
||||
Lora(s): []
|
||||
truncation_length = 2048
|
||||
instruction_template = Alpaca
|
||||
✅ Aeala_VicUnlocked-alpaca-30b-4bit PASS (21)
|
||||
Model: alpaca-30b-4bit
|
||||
Lora(s): []
|
||||
truncation_length = 2048
|
||||
instruction_template = Alpaca
|
||||
✅ alpaca-30b-4bit PASS (21)
|
||||
"""
|
@ -6,7 +6,19 @@ from extensions.api.util import build_parameters, try_start_cloudflared
|
||||
from modules import shared
|
||||
from modules.chat import generate_chat_reply
|
||||
from modules.text_generation import encode, generate_reply, stop_everything_event
|
||||
from modules.models import load_model, unload_model
|
||||
from modules.LoRA import add_lora_to_model
|
||||
from modules.utils import get_available_models
|
||||
from server import get_model_specific_settings, update_model_parameters
|
||||
|
||||
def get_model_info():
    """Describe the currently selected model as a dict for the model API reply.

    Contains the model name and LoRA names from ``shared``, plus the
    ``shared.settings`` object and the attribute dict of ``shared.args``.
    """
    info = {
        'model_name': shared.model_name,
        'lora_names': shared.lora_names,
    }

    # dump
    info['shared.settings'] = shared.settings
    info['shared.args'] = vars(shared.args)
    return info
|
||||
|
||||
class Handler(BaseHTTPRequestHandler):
|
||||
def do_GET(self):
|
||||
@ -91,6 +103,67 @@ class Handler(BaseHTTPRequestHandler):
|
||||
|
||||
self.wfile.write(response.encode('utf-8'))
|
||||
|
||||
elif self.path == '/api/v1/model':
|
||||
self.send_response(200)
|
||||
self.send_header('Content-Type', 'application/json')
|
||||
self.end_headers()
|
||||
|
||||
# by default return the same as the GET interface
|
||||
result = shared.model_name
|
||||
|
||||
# Actions: info, load, list, unload
|
||||
action = body.get('action', '')
|
||||
|
||||
if action == 'load':
|
||||
model_name = body['model_name']
|
||||
args = body.get('args', {})
|
||||
print('args', args)
|
||||
for k in args:
|
||||
setattr(shared.args, k, args[k])
|
||||
|
||||
shared.model_name = model_name
|
||||
unload_model()
|
||||
|
||||
model_settings = get_model_specific_settings(shared.model_name)
|
||||
shared.settings.update(model_settings)
|
||||
update_model_parameters(model_settings, initial=True)
|
||||
|
||||
if shared.settings['mode'] != 'instruct':
|
||||
shared.settings['instruction_template'] = None
|
||||
|
||||
try:
|
||||
shared.model, shared.tokenizer = load_model(shared.model_name)
|
||||
if shared.args.lora:
|
||||
add_lora_to_model(shared.args.lora) # list
|
||||
|
||||
except Exception as e:
|
||||
response = json.dumps({'error': { 'message': repr(e) } })
|
||||
|
||||
self.wfile.write(response.encode('utf-8'))
|
||||
raise e
|
||||
|
||||
shared.args.model = shared.model_name
|
||||
|
||||
result = get_model_info()
|
||||
|
||||
elif action == 'unload':
|
||||
unload_model()
|
||||
shared.model_name = None
|
||||
shared.args.model = None
|
||||
result = get_model_info()
|
||||
|
||||
elif action == 'list':
|
||||
result = get_available_models()
|
||||
|
||||
elif action == 'info':
|
||||
result = get_model_info()
|
||||
|
||||
response = json.dumps({
|
||||
'result': result,
|
||||
})
|
||||
|
||||
self.wfile.write(response.encode('utf-8'))
|
||||
|
||||
elif self.path == '/api/v1/token-count':
|
||||
self.send_response(200)
|
||||
self.send_header('Content-Type', 'application/json')
|
||||
|
@ -56,7 +56,12 @@ class LLaVA_v0_Pipeline(AbstractMultimodalPipeline):
|
||||
|
||||
@staticmethod
def embed_tokens(input_ids: torch.Tensor) -> torch.Tensor:
    """Embed ``input_ids`` with the loaded model's token-embedding layer.

    The embedding layer is taken from ``shared.model.model.embed_tokens`` when
    that attribute exists, otherwise from one level deeper
    (``shared.model.model.model.embed_tokens``, the AutoGPTQ case).

    Returns the embeddings moved to ``shared.model.device`` with dtype
    ``shared.model.dtype``.
    """
    # The pre-change one-line return is dropped here: left in place ahead of
    # the lookup below, it would make the AutoGPTQ fallback unreachable.
    inner = shared.model.model
    if hasattr(inner, 'embed_tokens'):
        func = inner.embed_tokens
    else:
        func = inner.model.embed_tokens  # AutoGPTQ case

    return func(input_ids).to(shared.model.device, dtype=shared.model.dtype)
|
||||
|
||||
@staticmethod
|
||||
def placeholder_embeddings() -> torch.Tensor:
|
||||
|
Loading…
Reference in New Issue
Block a user