diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 40a4cab6..6fd533b0 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -78,12 +78,7 @@ def process_parameters(body, is_legacy=False): max_tokens_str = 'length' if is_legacy else 'max_tokens' generate_params['max_new_tokens'] = body.pop(max_tokens_str) if generate_params['truncation_length'] == 0: - if shared.args.loader and shared.args.loader.lower().startswith('exllama'): - generate_params['truncation_length'] = shared.args.max_seq_len - elif shared.args.loader and shared.args.loader in ['llama.cpp', 'llamacpp_HF', 'ctransformers']: - generate_params['truncation_length'] = shared.args.n_ctx - else: - generate_params['truncation_length'] = shared.settings['truncation_length'] + generate_params['truncation_length'] = shared.settings['truncation_length'] if body['preset'] is not None: preset = load_preset_memoized(body['preset']) diff --git a/modules/models.py b/modules/models.py index e58d5770..70e14361 100644 --- a/modules/models.py +++ b/modules/models.py @@ -97,6 +97,13 @@ def load_model(model_name, loader=None): llama_attn_hijack.hijack_llama_attention() shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings}) + if loader.lower().startswith('exllama'): + shared.settings['truncation_length'] = shared.args.max_seq_len + elif loader in ['llama.cpp', 'llamacpp_HF', 'ctransformers']: + shared.settings['truncation_length'] = shared.args.n_ctx + + logger.info(f"CONTEXT LENGTH: {shared.settings['truncation_length']}") + logger.info(f"INSTRUCTION TEMPLATE: {shared.settings['instruction_template']}") logger.info(f"Loaded the model in {(time.time()-t0):.2f} seconds.") return model, tokenizer @@ -395,6 +402,7 @@ def get_max_memory_dict(): total_mem = (torch.xpu.get_device_properties(0).total_memory / (1024 * 1024)) else: total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024)) + suggestion = round((total_mem - 1000) / 1000) * 1000 if total_mem - suggestion < 800: suggestion -= 1000