From 07de7d0426bbc7b0bfbbe166e51bf711b5338afa Mon Sep 17 00:00:00 2001
From: catalpaaa <89681913+catalpaaa@users.noreply.github.com>
Date: Mon, 17 Apr 2023 06:47:26 -0700
Subject: [PATCH] Load llamacpp before quantized model (#1307)

---
 modules/models.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/modules/models.py b/modules/models.py
index 2d3ce2ad..3ec4cd9d 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -99,6 +99,16 @@ def load_model(model_name):
 
         return model, tokenizer
 
+    # llamacpp model
+    elif shared.is_llamacpp:
+        from modules.llamacpp_model_alternative import LlamaCppModel
+
+        model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('ggml*.bin'))[0]
+        print(f"llama.cpp weights detected: {model_file}\n")
+
+        model, tokenizer = LlamaCppModel.from_pretrained(model_file)
+        return model, tokenizer
+
     # Quantized model
     elif shared.args.wbits > 0:
 
@@ -116,16 +126,6 @@ def load_model(model_name):
 
             model = load_quantized(model_name)
 
-    # llamacpp model
-    elif shared.is_llamacpp:
-        from modules.llamacpp_model_alternative import LlamaCppModel
-
-        model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('ggml*.bin'))[0]
-        print(f"llama.cpp weights detected: {model_file}\n")
-
-        model, tokenizer = LlamaCppModel.from_pretrained(model_file)
-        return model, tokenizer
-
     # Custom
     else:
         params = {"low_cpu_mem_usage": True}