diff --git a/api-examples/api-example-model.py b/api-examples/api-example-model.py
deleted file mode 100644
index 44109d36..00000000
--- a/api-examples/api-example-model.py
+++ /dev/null
@@ -1,176 +0,0 @@
-#!/usr/bin/env python3
-
-import requests
-
-HOST = '0.0.0.0:5000'
-
-
-def generate(prompt, tokens=200):
-    request = {'prompt': prompt, 'max_new_tokens': tokens}
-    response = requests.post(f'http://{HOST}/api/v1/generate', json=request)
-
-    if response.status_code == 200:
-        return response.json()['results'][0]['text']
-
-
-def model_api(request):
-    response = requests.post(f'http://{HOST}/api/v1/model', json=request)
-    return response.json()
-
-
-# print some common settings
-def print_basic_model_info(response):
-    basic_settings = ['truncation_length', 'instruction_template']
-    print("Model: ", response['result']['model_name'])
-    print("Lora(s): ", response['result']['lora_names'])
-    for setting in basic_settings:
-        print(setting, "=", response['result']['shared.settings'][setting])
-
-
-# model info
-def model_info():
-    response = model_api({'action': 'info'})
-    print_basic_model_info(response)
-
-
-# simple loader
-def model_load(model_name):
-    return model_api({'action': 'load', 'model_name': model_name})
-
-
-# complex loader
-def complex_model_load(model):
-
-    def guess_groupsize(model_name):
-        if '1024g' in model_name:
-            return 1024
-        elif '128g' in model_name:
-            return 128
-        elif '32g' in model_name:
-            return 32
-        else:
-            return -1
-
-    req = {
-        'action': 'load',
-        'model_name': model,
-        'args': {
-            'loader': 'AutoGPTQ',
-
-            'bf16': False,
-            'load_in_8bit': False,
-            'groupsize': 0,
-            'wbits': 0,
-
-            # llama.cpp
-            'threads': 0,
-            'n_batch': 512,
-            'no_mmap': False,
-            'mlock': False,
-            'cache_capacity': None,
-            'n_gpu_layers': 0,
-            'n_ctx': 2048,
-
-            # RWKV
-            'rwkv_strategy': None,
-            'rwkv_cuda_on': False,
-
-            # b&b 4-bit
-            # 'load_in_4bit': False,
-            # 'compute_dtype': 'float16',
-            # 'quant_type': 'nf4',
-            # 'use_double_quant': False,
-
-            # "cpu": false,
-            # "auto_devices": false,
-            # "gpu_memory": null,
-            # "cpu_memory": null,
-            # "disk": false,
-            # "disk_cache_dir": "cache",
-        },
-    }
-
-    model = model.lower()
-
-    if '4bit' in model or 'gptq' in model or 'int4' in model:
-        req['args']['wbits'] = 4
-        req['args']['groupsize'] = guess_groupsize(model)
-    elif '3bit' in model:
-        req['args']['wbits'] = 3
-        req['args']['groupsize'] = guess_groupsize(model)
-    else:
-        req['args']['gptq_for_llama'] = False
-
-    if '8bit' in model:
-        req['args']['load_in_8bit'] = True
-    elif '-hf' in model or 'fp16' in model:
-        if '7b' in model:
-            req['args']['bf16'] = True  # for 24GB
-        elif '13b' in model:
-            req['args']['load_in_8bit'] = True  # for 24GB
-    elif 'gguf' in model:
-        # req['args']['threads'] = 16
-        if '7b' in model:
-            req['args']['n_gpu_layers'] = 100
-        elif '13b' in model:
-            req['args']['n_gpu_layers'] = 100
-        elif '30b' in model or '33b' in model:
-            req['args']['n_gpu_layers'] = 59  # 24GB
-        elif '65b' in model:
-            req['args']['n_gpu_layers'] = 42  # 24GB
-    elif 'rwkv' in model:
-        req['args']['rwkv_cuda_on'] = True
-        if '14b' in model:
-            req['args']['rwkv_strategy'] = 'cuda f16i8'  # 24GB
-        else:
-            req['args']['rwkv_strategy'] = 'cuda f16'  # 24GB
-
-    return model_api(req)
-
-
-if __name__ == '__main__':
-    for model in model_api({'action': 'list'})['result']:
-        try:
-            resp = complex_model_load(model)
-
-            if 'error' in resp:
-                print(f"❌ {model} FAIL Error: {resp['error']['message']}")
-                continue
-            else:
-                print_basic_model_info(resp)
-
-            ans = generate("0,1,1,2,3,5,8,13,", tokens=2)
-
-            if '21' in ans:
-                print(f"✅ {model} PASS ({ans})")
-            else:
-                print(f"❌ {model} FAIL ({ans})")
-
-        except Exception as e:
-            print(f"❌ {model} FAIL Exception: {repr(e)}")
-
-
-# 0,1,1,2,3,5,8,13, is the fibonacci sequence, the next number is 21.
-# Some results below.
-""" $ ./model-api-example.py
-Model: 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda
-Lora(s): []
-truncation_length = 2048
-instruction_template = Alpaca
-✅ 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda PASS (21)
-Model: 4bit_WizardLM-13B-Uncensored-4bit-128g
-Lora(s): []
-truncation_length = 2048
-instruction_template = WizardLM
-✅ 4bit_WizardLM-13B-Uncensored-4bit-128g PASS (21)
-Model: Aeala_VicUnlocked-alpaca-30b-4bit
-Lora(s): []
-truncation_length = 2048
-instruction_template = Alpaca
-✅ Aeala_VicUnlocked-alpaca-30b-4bit PASS (21)
-Model: alpaca-30b-4bit
-Lora(s): []
-truncation_length = 2048
-instruction_template = Alpaca
-✅ alpaca-30b-4bit PASS (21)
-"""
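
The deleted script drove two legacy endpoints: /api/v1/generate for completions and /api/v1/model for listing, inspecting, and loading models. For reference, here is a minimal sketch of the model-management calls it exercised, assuming an older server build that still serves the pre-removal /api/v1/model route on the same HOST; current builds no longer expose this API.

#!/usr/bin/env python3
# Minimal sketch of the legacy model-management calls used by the deleted
# example. Assumes an older server build that still exposes /api/v1/model.

import requests

HOST = '0.0.0.0:5000'


def model_api(request):
    # POST a JSON action to the legacy model endpoint and decode the reply.
    return requests.post(f'http://{HOST}/api/v1/model', json=request).json()


# 'list' returns the available model names under the 'result' key.
models = model_api({'action': 'list'})['result']
print(models)

# 'load' takes a model name; errors come back under resp['error']['message'].
if models:
    resp = model_api({'action': 'load', 'model_name': models[0]})
    if 'error' in resp:
        print('load failed:', resp['error']['message'])
    else:
        print('loaded:', resp['result']['model_name'])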