mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-22 08:07:56 +01:00
Add context_instruct to API. Load default model instruction template … (#2688)
This commit is contained in:
parent
324e45b848
commit
d9fabdde40
@ -23,7 +23,8 @@ async def run(user_input, history):
|
|||||||
'history': history,
|
'history': history,
|
||||||
'mode': 'instruct', # Valid options: 'chat', 'chat-instruct', 'instruct'
|
'mode': 'instruct', # Valid options: 'chat', 'chat-instruct', 'instruct'
|
||||||
'character': 'Example',
|
'character': 'Example',
|
||||||
'instruction_template': 'Vicuna-v1.1',
|
'instruction_template': 'Vicuna-v1.1', # Will get autodetected if unset
|
||||||
|
# 'context_instruct': '', # Optional
|
||||||
'your_name': 'You',
|
'your_name': 'You',
|
||||||
|
|
||||||
'regenerate': False,
|
'regenerate': False,
|
||||||
@ -34,7 +35,7 @@ async def run(user_input, history):
|
|||||||
|
|
||||||
# Generation params. If 'preset' is set to different than 'None', the values
|
# Generation params. If 'preset' is set to different than 'None', the values
|
||||||
# in presets/preset-name.yaml are used instead of the individual numbers.
|
# in presets/preset-name.yaml are used instead of the individual numbers.
|
||||||
'preset': 'None',
|
'preset': 'None',
|
||||||
'do_sample': True,
|
'do_sample': True,
|
||||||
'temperature': 0.7,
|
'temperature': 0.7,
|
||||||
'top_p': 0.1,
|
'top_p': 0.1,
|
||||||
|
@ -17,7 +17,8 @@ def run(user_input, history):
|
|||||||
'history': history,
|
'history': history,
|
||||||
'mode': 'instruct', # Valid options: 'chat', 'chat-instruct', 'instruct'
|
'mode': 'instruct', # Valid options: 'chat', 'chat-instruct', 'instruct'
|
||||||
'character': 'Example',
|
'character': 'Example',
|
||||||
'instruction_template': 'Vicuna-v1.1',
|
'instruction_template': 'Vicuna-v1.1', # Will get autodetected if unset
|
||||||
|
# 'context_instruct': '', # Optional
|
||||||
'your_name': 'You',
|
'your_name': 'You',
|
||||||
|
|
||||||
'regenerate': False,
|
'regenerate': False,
|
||||||
|
@ -4,8 +4,9 @@ import requests
|
|||||||
|
|
||||||
HOST = '0.0.0.0:5000'
|
HOST = '0.0.0.0:5000'
|
||||||
|
|
||||||
def generate(prompt, tokens = 200):
|
|
||||||
request = { 'prompt': prompt, 'max_new_tokens': tokens }
|
def generate(prompt, tokens=200):
|
||||||
|
request = {'prompt': prompt, 'max_new_tokens': tokens}
|
||||||
response = requests.post(f'http://{HOST}/api/v1/generate', json=request)
|
response = requests.post(f'http://{HOST}/api/v1/generate', json=request)
|
||||||
|
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
@ -23,7 +24,7 @@ def print_basic_model_info(response):
|
|||||||
print("Model: ", response['result']['model_name'])
|
print("Model: ", response['result']['model_name'])
|
||||||
print("Lora(s): ", response['result']['lora_names'])
|
print("Lora(s): ", response['result']['lora_names'])
|
||||||
for setting in basic_settings:
|
for setting in basic_settings:
|
||||||
print(setting, "=", response['result']['shared.settings'][setting])
|
print(setting, "=", response['result']['shared.settings'][setting])
|
||||||
|
|
||||||
|
|
||||||
# model info
|
# model info
|
||||||
@ -75,17 +76,17 @@ def complex_model_load(model):
|
|||||||
'rwkv_cuda_on': False,
|
'rwkv_cuda_on': False,
|
||||||
|
|
||||||
# b&b 4-bit
|
# b&b 4-bit
|
||||||
#'load_in_4bit': False,
|
# 'load_in_4bit': False,
|
||||||
#'compute_dtype': 'float16',
|
# 'compute_dtype': 'float16',
|
||||||
#'quant_type': 'nf4',
|
# 'quant_type': 'nf4',
|
||||||
#'use_double_quant': False,
|
# 'use_double_quant': False,
|
||||||
|
|
||||||
#"cpu": false,
|
# "cpu": false,
|
||||||
#"auto_devices": false,
|
# "auto_devices": false,
|
||||||
#"gpu_memory": null,
|
# "gpu_memory": null,
|
||||||
#"cpu_memory": null,
|
# "cpu_memory": null,
|
||||||
#"disk": false,
|
# "disk": false,
|
||||||
#"disk_cache_dir": "cache",
|
# "disk_cache_dir": "cache",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -104,26 +105,25 @@ def complex_model_load(model):
|
|||||||
req['args']['load_in_8bit'] = True
|
req['args']['load_in_8bit'] = True
|
||||||
elif '-hf' in model or 'fp16' in model:
|
elif '-hf' in model or 'fp16' in model:
|
||||||
if '7b' in model:
|
if '7b' in model:
|
||||||
req['args']['bf16'] = True # for 24GB
|
req['args']['bf16'] = True # for 24GB
|
||||||
elif '13b' in model:
|
elif '13b' in model:
|
||||||
req['args']['load_in_8bit'] = True # for 24GB
|
req['args']['load_in_8bit'] = True # for 24GB
|
||||||
elif 'ggml' in model:
|
elif 'ggml' in model:
|
||||||
#req['args']['threads'] = 16
|
# req['args']['threads'] = 16
|
||||||
if '7b' in model:
|
if '7b' in model:
|
||||||
req['args']['n_gpu_layers'] = 100
|
req['args']['n_gpu_layers'] = 100
|
||||||
elif '13b' in model:
|
elif '13b' in model:
|
||||||
req['args']['n_gpu_layers'] = 100
|
req['args']['n_gpu_layers'] = 100
|
||||||
elif '30b' in model or '33b' in model:
|
elif '30b' in model or '33b' in model:
|
||||||
req['args']['n_gpu_layers'] = 59 # 24GB
|
req['args']['n_gpu_layers'] = 59 # 24GB
|
||||||
elif '65b' in model:
|
elif '65b' in model:
|
||||||
req['args']['n_gpu_layers'] = 42 # 24GB
|
req['args']['n_gpu_layers'] = 42 # 24GB
|
||||||
elif 'rwkv' in model:
|
elif 'rwkv' in model:
|
||||||
req['args']['rwkv_cuda_on'] = True
|
req['args']['rwkv_cuda_on'] = True
|
||||||
if '14b' in model:
|
if '14b' in model:
|
||||||
req['args']['rwkv_strategy'] = 'cuda f16i8' # 24GB
|
req['args']['rwkv_strategy'] = 'cuda f16i8' # 24GB
|
||||||
else:
|
else:
|
||||||
req['args']['rwkv_strategy'] = 'cuda f16' # 24GB
|
req['args']['rwkv_strategy'] = 'cuda f16' # 24GB
|
||||||
|
|
||||||
|
|
||||||
return model_api(req)
|
return model_api(req)
|
||||||
|
|
||||||
@ -134,7 +134,7 @@ if __name__ == '__main__':
|
|||||||
resp = complex_model_load(model)
|
resp = complex_model_load(model)
|
||||||
|
|
||||||
if 'error' in resp:
|
if 'error' in resp:
|
||||||
print (f"❌ {model} FAIL Error: {resp['error']['message']}")
|
print(f"❌ {model} FAIL Error: {resp['error']['message']}")
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
print_basic_model_info(resp)
|
print_basic_model_info(resp)
|
||||||
@ -142,12 +142,12 @@ if __name__ == '__main__':
|
|||||||
ans = generate("0,1,1,2,3,5,8,13,", tokens=2)
|
ans = generate("0,1,1,2,3,5,8,13,", tokens=2)
|
||||||
|
|
||||||
if '21' in ans:
|
if '21' in ans:
|
||||||
print (f"✅ {model} PASS ({ans})")
|
print(f"✅ {model} PASS ({ans})")
|
||||||
else:
|
else:
|
||||||
print (f"❌ {model} FAIL ({ans})")
|
print(f"❌ {model} FAIL ({ans})")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print (f"❌ {model} FAIL Exception: {repr(e)}")
|
print(f"❌ {model} FAIL Exception: {repr(e)}")
|
||||||
|
|
||||||
|
|
||||||
# 0,1,1,2,3,5,8,13, is the fibonacci sequence, the next number is 21.
|
# 0,1,1,2,3,5,8,13, is the fibonacci sequence, the next number is 21.
|
||||||
|
@ -23,7 +23,7 @@ async def run(context):
|
|||||||
|
|
||||||
# Generation params. If 'preset' is set to different than 'None', the values
|
# Generation params. If 'preset' is set to different than 'None', the values
|
||||||
# in presets/preset-name.yaml are used instead of the individual numbers.
|
# in presets/preset-name.yaml are used instead of the individual numbers.
|
||||||
'preset': 'None',
|
'preset': 'None',
|
||||||
'do_sample': True,
|
'do_sample': True,
|
||||||
'temperature': 0.7,
|
'temperature': 0.7,
|
||||||
'top_p': 0.1,
|
'top_p': 0.1,
|
||||||
|
@ -15,7 +15,7 @@ def run(prompt):
|
|||||||
|
|
||||||
# Generation params. If 'preset' is set to different than 'None', the values
|
# Generation params. If 'preset' is set to different than 'None', the values
|
||||||
# in presets/preset-name.yaml are used instead of the individual numbers.
|
# in presets/preset-name.yaml are used instead of the individual numbers.
|
||||||
'preset': 'None',
|
'preset': 'None',
|
||||||
'do_sample': True,
|
'do_sample': True,
|
||||||
'temperature': 0.7,
|
'temperature': 0.7,
|
||||||
'top_p': 0.1,
|
'top_p': 0.1,
|
||||||
|
@ -59,7 +59,10 @@ def build_parameters(body, chat=False):
|
|||||||
|
|
||||||
if chat:
|
if chat:
|
||||||
character = body.get('character')
|
character = body.get('character')
|
||||||
instruction_template = body.get('instruction_template')
|
instruction_template = body.get('instruction_template', shared.settings['instruction_template'])
|
||||||
|
if str(instruction_template) == "None":
|
||||||
|
instruction_template = "Vicuna-v1.1"
|
||||||
|
|
||||||
name1, name2, _, greeting, context, _ = load_character_memoized(character, str(body.get('your_name', shared.settings['name1'])), shared.settings['name2'], instruct=False)
|
name1, name2, _, greeting, context, _ = load_character_memoized(character, str(body.get('your_name', shared.settings['name1'])), shared.settings['name2'], instruct=False)
|
||||||
name1_instruct, name2_instruct, _, _, context_instruct, turn_template = load_character_memoized(instruction_template, '', '', instruct=True)
|
name1_instruct, name2_instruct, _, _, context_instruct, turn_template = load_character_memoized(instruction_template, '', '', instruct=True)
|
||||||
generate_params.update({
|
generate_params.update({
|
||||||
@ -72,7 +75,7 @@ def build_parameters(body, chat=False):
|
|||||||
'greeting': greeting,
|
'greeting': greeting,
|
||||||
'name1_instruct': name1_instruct,
|
'name1_instruct': name1_instruct,
|
||||||
'name2_instruct': name2_instruct,
|
'name2_instruct': name2_instruct,
|
||||||
'context_instruct': context_instruct,
|
'context_instruct': body.get('context_instruct', context_instruct),
|
||||||
'turn_template': turn_template,
|
'turn_template': turn_template,
|
||||||
'chat-instruct_command': str(body.get('chat-instruct_command', shared.settings['chat-instruct_command'])),
|
'chat-instruct_command': str(body.get('chat-instruct_command', shared.settings['chat-instruct_command'])),
|
||||||
'history': body.get('history', {'internal': [], 'visible': []})
|
'history': body.get('history', {'internal': [], 'visible': []})
|
||||||
|
Loading…
Reference in New Issue
Block a user