Mirror of https://github.com/oobabooga/text-generation-webui.git (synced 2024-11-25 17:29:22 +01:00)
Print context length / instruction template to terminal when loading models
parent e05d8fd441
commit e6f44d6d19
@@ -78,12 +78,7 @@ def process_parameters(body, is_legacy=False):
     max_tokens_str = 'length' if is_legacy else 'max_tokens'
     generate_params['max_new_tokens'] = body.pop(max_tokens_str)
     if generate_params['truncation_length'] == 0:
-        if shared.args.loader and shared.args.loader.lower().startswith('exllama'):
-            generate_params['truncation_length'] = shared.args.max_seq_len
-        elif shared.args.loader and shared.args.loader in ['llama.cpp', 'llamacpp_HF', 'ctransformers']:
-            generate_params['truncation_length'] = shared.args.n_ctx
-        else:
-            generate_params['truncation_length'] = shared.settings['truncation_length']
+        generate_params['truncation_length'] = shared.settings['truncation_length']
 
     if body['preset'] is not None:
         preset = load_preset_memoized(body['preset'])
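With this hunk, the API request path no longer branches on the loader itself: when a request leaves truncation_length at 0, it simply falls back to the global shared.settings value, which load_model fills in (next hunk). A minimal sketch of that fallback, using a stand-in dict instead of the real modules.shared:

# Stand-in for shared.settings; assumed to be populated by load_model().
settings = {'truncation_length': 4096}

def resolve_truncation_length(requested: int) -> int:
    """Return the request's value, or the global setting when the request sends 0."""
    return requested if requested != 0 else settings['truncation_length']

print(resolve_truncation_length(0))     # -> 4096 (falls back to the loaded model's value)
print(resolve_truncation_length(2048))  # -> 2048 (an explicit request still wins)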
@@ -97,6 +97,13 @@ def load_model(model_name, loader=None):
         llama_attn_hijack.hijack_llama_attention()
 
     shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
+    if loader.lower().startswith('exllama'):
+        shared.settings['truncation_length'] = shared.args.max_seq_len
+    elif loader in ['llama.cpp', 'llamacpp_HF', 'ctransformers']:
+        shared.settings['truncation_length'] = shared.args.n_ctx
+
+    logger.info(f"CONTEXT LENGTH: {shared.settings['truncation_length']}")
+    logger.info(f"INSTRUCTION TEMPLATE: {shared.settings['instruction_template']}")
     logger.info(f"Loaded the model in {(time.time()-t0):.2f} seconds.")
     return model, tokenizer
 
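This is the core of the change: load_model now records the loader-specific context length in shared.settings and prints it, along with the instruction template, when a model finishes loading. A rough standalone sketch of that mapping and logging, with a Namespace and a dict standing in for shared.args and shared.settings (the values and the loader name in the call are illustrative only):

import logging
from argparse import Namespace

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Stand-ins for modules.shared; the numbers here are assumed, not real defaults.
args = Namespace(max_seq_len=4096, n_ctx=2048)
settings = {'truncation_length': 2048, 'instruction_template': 'Alpaca'}

def apply_loader_context_length(loader: str) -> None:
    """Mirror the diff: take the context length from the loader's own argument."""
    if loader.lower().startswith('exllama'):
        settings['truncation_length'] = args.max_seq_len
    elif loader in ['llama.cpp', 'llamacpp_HF', 'ctransformers']:
        settings['truncation_length'] = args.n_ctx

    logger.info(f"CONTEXT LENGTH: {settings['truncation_length']}")
    logger.info(f"INSTRUCTION TEMPLATE: {settings['instruction_template']}")

apply_loader_context_length('ExLlamav2_HF')  # logs CONTEXT LENGTH: 4096 and the template name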
@@ -395,6 +402,7 @@ def get_max_memory_dict():
             total_mem = (torch.xpu.get_device_properties(0).total_memory / (1024 * 1024))
         else:
             total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024))
 
         suggestion = round((total_mem - 1000) / 1000) * 1000
         if total_mem - suggestion < 800:
             suggestion -= 1000
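For context, the suggestion arithmetic in get_max_memory_dict rounds the total VRAM down to a multiple of 1000 MiB and backs off a further 1000 MiB when the remaining headroom would drop under 800 MiB. A quick worked example with a made-up 24 GiB card:

# Worked example of the --auto-devices suggestion; total_mem is an assumed value.
total_mem = 24576  # MiB reported for a hypothetical 24 GiB GPU

suggestion = round((total_mem - 1000) / 1000) * 1000  # round(23.576) * 1000 -> 24000
if total_mem - suggestion < 800:                       # only 576 MiB of headroom left
    suggestion -= 1000                                 # back off to 23000

print(suggestion)  # 23000 MiB suggested for the model, keeping ~1.5 GiB free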