From f243b4ca9cec7da346cf5d4dd74a16a5cc4515c5 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 4 Jul 2024 19:15:37 -0700
Subject: [PATCH] Make llama-cpp-python not crash immediately

---
 modules/llama_cpp_python_hijack.py | 52 +++++++++++++++++++-----------
 modules/llamacpp_hf.py             | 29 ++---------------
 modules/llamacpp_model.py          | 29 ++---------------
 3 files changed, 37 insertions(+), 73 deletions(-)

diff --git a/modules/llama_cpp_python_hijack.py b/modules/llama_cpp_python_hijack.py
index eb23177f..d1e1a342 100644
--- a/modules/llama_cpp_python_hijack.py
+++ b/modules/llama_cpp_python_hijack.py
@@ -1,3 +1,4 @@
+import importlib
 from typing import Sequence
 
 from tqdm import tqdm
@@ -5,20 +6,38 @@ from tqdm import tqdm
 from modules import shared
 from modules.cache_utils import process_llamacpp_cache
 
-try:
-    import llama_cpp
-except:
-    llama_cpp = None
 
-try:
-    import llama_cpp_cuda
-except:
-    llama_cpp_cuda = None
+def llama_cpp_lib():
+    return_lib = None
 
-try:
-    import llama_cpp_cuda_tensorcores
-except:
-    llama_cpp_cuda_tensorcores = None
+    if shared.args.cpu:
+        try:
+            return_lib = importlib.import_module('llama_cpp')
+        except:
+            pass
+
+    if shared.args.tensorcores and return_lib is None:
+        try:
+            return_lib = importlib.import_module('llama_cpp_cuda_tensorcores')
+        except:
+            pass
+
+    if return_lib is None:
+        try:
+            return_lib = importlib.import_module('llama_cpp_cuda')
+        except:
+            pass
+
+    if return_lib is None and not shared.args.cpu:
+        try:
+            return_lib = importlib.import_module('llama_cpp')
+        except:
+            pass
+
+    if return_lib is not None:
+        monkey_patch_llama_cpp_python(return_lib)
+
+    return return_lib
 
 
 def eval_with_progress(self, tokens: Sequence[int]):
@@ -63,7 +82,7 @@ def eval_with_progress(self, tokens: Sequence[int]):
     self.n_tokens += n_tokens
 
 
-def monkey_patch_generate(lib):
+def monkey_patch_llama_cpp_python(lib):
 
     def my_generate(self, *args, **kwargs):
 
@@ -77,11 +96,6 @@ def monkey_patch_generate(lib):
         for output in self.original_generate(*args, **kwargs):
             yield output
 
+    lib.Llama.eval = eval_with_progress
     lib.Llama.original_generate = lib.Llama.generate
     lib.Llama.generate = my_generate
-
-
-for lib in [llama_cpp, llama_cpp_cuda, llama_cpp_cuda_tensorcores]:
-    if lib is not None:
-        lib.Llama.eval = eval_with_progress
-        monkey_patch_generate(lib)
diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
index ed0347d7..327e3a7b 100644
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -7,35 +7,10 @@ from torch.nn import CrossEntropyLoss
 from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
-from modules import llama_cpp_python_hijack, shared
+from modules import shared
+from modules.llama_cpp_python_hijack import llama_cpp_lib
 from modules.logging_colors import logger
 
-try:
-    import llama_cpp
-except:
-    llama_cpp = None
-
-try:
-    import llama_cpp_cuda
-except:
-    llama_cpp_cuda = None
-
-try:
-    import llama_cpp_cuda_tensorcores
-except:
-    llama_cpp_cuda_tensorcores = None
-
-
-def llama_cpp_lib():
-    if shared.args.cpu and llama_cpp is not None:
-        return llama_cpp
-    elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
-        return llama_cpp_cuda_tensorcores
-    elif llama_cpp_cuda is not None:
-        return llama_cpp_cuda
-    else:
-        return llama_cpp
-
 
 class LlamacppHF(PreTrainedModel):
     def __init__(self, model, path):
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index fe7c1efe..a16230ca 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -4,37 +4,12 @@ from functools import partial
 import numpy as np
 import torch
 
-from modules import llama_cpp_python_hijack, shared
+from modules import shared
 from modules.callbacks import Iteratorize
+from modules.llama_cpp_python_hijack import llama_cpp_lib
 from modules.logging_colors import logger
 from modules.text_generation import get_max_prompt_length
 
-try:
-    import llama_cpp
-except:
-    llama_cpp = None
-
-try:
-    import llama_cpp_cuda
-except:
-    llama_cpp_cuda = None
-
-try:
-    import llama_cpp_cuda_tensorcores
-except:
-    llama_cpp_cuda_tensorcores = None
-
-
-def llama_cpp_lib():
-    if shared.args.cpu and llama_cpp is not None:
-        return llama_cpp
-    elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
-        return llama_cpp_cuda_tensorcores
-    elif llama_cpp_cuda is not None:
-        return llama_cpp_cuda
-    else:
-        return llama_cpp
-
 
 def ban_eos_logits_processor(eos_token, input_ids, logits):
     logits[eos_token] = -float('inf')
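
Usage note (not part of the patch): with this change, modules/llamacpp_model.py and modules/llamacpp_hf.py no longer import llama_cpp, llama_cpp_cuda, or llama_cpp_cuda_tensorcores at module load time. They call llama_cpp_lib(), which imports and monkey-patches whichever backend actually loads, so a broken llama-cpp-python install fails only when a GGUF model is loaded instead of crashing the whole web UI on startup. Below is a minimal sketch of the call-site pattern, assuming the caller only needs the Llama class; the helper name load_gguf is hypothetical and does not appear in the patch.

    # Sketch of assumed usage; load_gguf is a hypothetical helper.
    from modules.llama_cpp_python_hijack import llama_cpp_lib


    def load_gguf(model_path, **params):
        lib = llama_cpp_lib()  # lazily imports and monkey-patches the selected backend
        if lib is None:
            raise ImportError("llama-cpp-python could not be imported")

        # Llama is the model class provided by llama-cpp-python, exposed by
        # whichever backend module was selected (CPU, CUDA, or tensorcores build).
        return lib.Llama(model_path=model_path, **params)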