From d9fabdde40af766837d3cc7d0758189ab0f6ea8d Mon Sep 17 00:00:00 2001
From: atriantafy
Date: Wed, 12 Jul 2023 04:01:03 +0100
Subject: [PATCH] Add context_instruct to API. Load default model instruction
 template … (#2688)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 api-examples/api-example-chat-stream.py |  5 ++-
 api-examples/api-example-chat.py        |  3 +-
 api-examples/api-example-model.py       | 50 ++++++++++++-------------
 api-examples/api-example-stream.py      |  2 +-
 api-examples/api-example.py             |  2 +-
 extensions/api/util.py                  |  7 +++-
 6 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/api-examples/api-example-chat-stream.py b/api-examples/api-example-chat-stream.py
index 8e37b569..14f6f9d6 100644
--- a/api-examples/api-example-chat-stream.py
+++ b/api-examples/api-example-chat-stream.py
@@ -23,7 +23,8 @@ async def run(user_input, history):
         'history': history,
         'mode': 'instruct',  # Valid options: 'chat', 'chat-instruct', 'instruct'
         'character': 'Example',
-        'instruction_template': 'Vicuna-v1.1',
+        'instruction_template': 'Vicuna-v1.1',  # Will get autodetected if unset
+        # 'context_instruct': '',  # Optional
         'your_name': 'You',
 
         'regenerate': False,
@@ -34,7 +35,7 @@ async def run(user_input, history):
 
         # Generation params. If 'preset' is set to different than 'None', the values
         # in presets/preset-name.yaml are used instead of the individual numbers.
-        'preset': 'None', 
+        'preset': 'None',
         'do_sample': True,
         'temperature': 0.7,
         'top_p': 0.1,
diff --git a/api-examples/api-example-chat.py b/api-examples/api-example-chat.py
index 23f2f186..0e155c63 100644
--- a/api-examples/api-example-chat.py
+++ b/api-examples/api-example-chat.py
@@ -17,7 +17,8 @@ def run(user_input, history):
         'history': history,
         'mode': 'instruct',  # Valid options: 'chat', 'chat-instruct', 'instruct'
         'character': 'Example',
-        'instruction_template': 'Vicuna-v1.1',
+        'instruction_template': 'Vicuna-v1.1',  # Will get autodetected if unset
+        # 'context_instruct': '',  # Optional
         'your_name': 'You',
 
         'regenerate': False,
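Usage sketch for the two new request fields above (not part of the patch): it assumes the same /api/v1/chat endpoint and response shape as api-examples/api-example-chat.py; the host and prompt values are placeholders.

import requests

HOST = 'localhost:5000'

def chat(user_input, history):
    request = {
        'user_input': user_input,
        'history': history,
        'mode': 'instruct',
        # 'instruction_template' is deliberately omitted: with this patch the
        # server falls back to its configured default, then to 'Vicuna-v1.1'.
        # New optional field: overrides the template's instruction context.
        'context_instruct': 'Answer in exactly one sentence.',
        'max_new_tokens': 200,
    }
    response = requests.post(f'http://{HOST}/api/v1/chat', json=request)
    if response.status_code == 200:
        # Same response shape as the chat examples in this patch.
        return response.json()['results'][0]['history']

history = {'internal': [], 'visible': []}
print(chat('What is the capital of France?', history))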
"disk": false, + # "disk_cache_dir": "cache", }, } @@ -104,26 +105,25 @@ def complex_model_load(model): req['args']['load_in_8bit'] = True elif '-hf' in model or 'fp16' in model: if '7b' in model: - req['args']['bf16'] = True # for 24GB + req['args']['bf16'] = True # for 24GB elif '13b' in model: - req['args']['load_in_8bit'] = True # for 24GB + req['args']['load_in_8bit'] = True # for 24GB elif 'ggml' in model: - #req['args']['threads'] = 16 + # req['args']['threads'] = 16 if '7b' in model: req['args']['n_gpu_layers'] = 100 elif '13b' in model: req['args']['n_gpu_layers'] = 100 elif '30b' in model or '33b' in model: - req['args']['n_gpu_layers'] = 59 # 24GB + req['args']['n_gpu_layers'] = 59 # 24GB elif '65b' in model: - req['args']['n_gpu_layers'] = 42 # 24GB + req['args']['n_gpu_layers'] = 42 # 24GB elif 'rwkv' in model: req['args']['rwkv_cuda_on'] = True if '14b' in model: - req['args']['rwkv_strategy'] = 'cuda f16i8' # 24GB + req['args']['rwkv_strategy'] = 'cuda f16i8' # 24GB else: - req['args']['rwkv_strategy'] = 'cuda f16' # 24GB - + req['args']['rwkv_strategy'] = 'cuda f16' # 24GB return model_api(req) @@ -134,7 +134,7 @@ if __name__ == '__main__': resp = complex_model_load(model) if 'error' in resp: - print (f"❌ {model} FAIL Error: {resp['error']['message']}") + print(f"❌ {model} FAIL Error: {resp['error']['message']}") continue else: print_basic_model_info(resp) @@ -142,12 +142,12 @@ if __name__ == '__main__': ans = generate("0,1,1,2,3,5,8,13,", tokens=2) if '21' in ans: - print (f"✅ {model} PASS ({ans})") + print(f"✅ {model} PASS ({ans})") else: - print (f"❌ {model} FAIL ({ans})") + print(f"❌ {model} FAIL ({ans})") except Exception as e: - print (f"❌ {model} FAIL Exception: {repr(e)}") + print(f"❌ {model} FAIL Exception: {repr(e)}") # 0,1,1,2,3,5,8,13, is the fibonacci sequence, the next number is 21. diff --git a/api-examples/api-example-stream.py b/api-examples/api-example-stream.py index 79a01e4d..1ae5a91c 100644 --- a/api-examples/api-example-stream.py +++ b/api-examples/api-example-stream.py @@ -23,7 +23,7 @@ async def run(context): # Generation params. If 'preset' is set to different than 'None', the values # in presets/preset-name.yaml are used instead of the individual numbers. - 'preset': 'None', + 'preset': 'None', 'do_sample': True, 'temperature': 0.7, 'top_p': 0.1, diff --git a/api-examples/api-example.py b/api-examples/api-example.py index b09823c3..4e45de9e 100644 --- a/api-examples/api-example.py +++ b/api-examples/api-example.py @@ -15,7 +15,7 @@ def run(prompt): # Generation params. If 'preset' is set to different than 'None', the values # in presets/preset-name.yaml are used instead of the individual numbers. 
diff --git a/extensions/api/util.py b/extensions/api/util.py
index a89365ce..a25c7885 100644
--- a/extensions/api/util.py
+++ b/extensions/api/util.py
@@ -59,7 +59,10 @@ def build_parameters(body, chat=False):
 
     if chat:
         character = body.get('character')
-        instruction_template = body.get('instruction_template')
+        instruction_template = body.get('instruction_template', shared.settings['instruction_template'])
+        if str(instruction_template) == "None":
+            instruction_template = "Vicuna-v1.1"
+
         name1, name2, _, greeting, context, _ = load_character_memoized(character, str(body.get('your_name', shared.settings['name1'])), shared.settings['name2'], instruct=False)
         name1_instruct, name2_instruct, _, _, context_instruct, turn_template = load_character_memoized(instruction_template, '', '', instruct=True)
         generate_params.update({
@@ -72,7 +75,7 @@ def build_parameters(body, chat=False):
             'greeting': greeting,
             'name1_instruct': name1_instruct,
             'name2_instruct': name2_instruct,
-            'context_instruct': context_instruct,
+            'context_instruct': body.get('context_instruct', context_instruct),
             'turn_template': turn_template,
             'chat-instruct_command': str(body.get('chat-instruct_command', shared.settings['chat-instruct_command'])),
             'history': body.get('history', {'internal': [], 'visible': []})
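The net effect of the util.py change, as a simplified, runnable paraphrase of the patched build_parameters() (resolve_instruct and its arguments are illustrative names, not code from the patch):

def resolve_instruct(body, settings, template_context):
    # 1. Template precedence: request field > server default > 'Vicuna-v1.1'.
    template = body.get('instruction_template', settings['instruction_template'])
    if str(template) == "None":
        template = "Vicuna-v1.1"
    # 2. Context precedence: an explicit 'context_instruct' in the request
    #    overrides the context loaded from the template file.
    context = body.get('context_instruct', template_context)
    return template, context

# A request that names no template but overrides the context:
template, context = resolve_instruct(
    {'context_instruct': 'Be terse.'},
    {'instruction_template': None},
    'A chat between a user and an assistant.')
print(template, '|', context)  # Vicuna-v1.1 | Be terse.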