diff --git a/README.md b/README.md index 6cba1b31..225ed6c4 100644 --- a/README.md +++ b/README.md @@ -90,10 +90,6 @@ cd text-generation-webui pip install -r requirements.txt ``` -#### llama.cpp with GPU acceleration - -Requires the additional compilation step described here: [GPU acceleration](https://github.com/oobabooga/text-generation-webui/blob/main/docs/llama.cpp-models.md#gpu-acceleration). - #### bitsandbytes bitsandbytes >= 0.39 may not work on older NVIDIA GPUs. In that case, to use `--load-in-8bit`, you may have to downgrade like this: diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py index 94d893c4..e09c1a74 100644 --- a/modules/llamacpp_hf.py +++ b/modules/llamacpp_hf.py @@ -3,7 +3,6 @@ from pathlib import Path from typing import Any, Dict, Optional, Union import torch -from llama_cpp import Llama from torch.nn import CrossEntropyLoss from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel from transformers.modeling_outputs import CausalLMOutputWithPast @@ -11,6 +10,10 @@ from transformers.modeling_outputs import CausalLMOutputWithPast from modules import shared from modules.logging_colors import logger +if torch.cuda.is_available(): + from llama_cpp_cuda import Llama +else: + from llama_cpp import Llama class LlamacppHF(PreTrainedModel): def __init__(self, model): diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 180b0f37..c6e6ec54 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -9,12 +9,17 @@ https://abetlen.github.io/llama-cpp-python/ import re from functools import partial -from llama_cpp import Llama, LlamaCache, LogitsProcessorList +import torch from modules import shared from modules.callbacks import Iteratorize from modules.logging_colors import logger +if torch.cuda.is_available(): + from llama_cpp_cuda import Llama, LlamaCache, LogitsProcessorList +else: + from llama_cpp import Llama, LlamaCache, LogitsProcessorList + def ban_eos_logits_processor(eos_token, input_ids, logits): logits[eos_token] = -float('inf') diff --git a/modules/shared.py b/modules/shared.py index da4b99f3..f0a426a0 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -62,7 +62,7 @@ settings = { 'chat_generation_attempts_max': 10, 'default_extensions': [], 'chat_default_extensions': ['gallery'], - 'preset': 'simple-1', + 'preset': 'Divine Intellect', 'prompt': 'QA', } diff --git a/requirements.txt b/requirements.txt index cbcb8320..913a568a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,18 +13,22 @@ Pillow>=9.5.0 pyyaml requests safetensors==0.3.1 -sentencepiece -tqdm scipy +sentencepiece tensorboard -wandb transformers==4.31.* +tqdm +wandb git+https://github.com/huggingface/peft@03eb378eb914fbee709ff7c86ba5b1d033b89524 bitsandbytes==0.40.2; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.40.2-py3-none-win_amd64.whl; platform_system == "Windows" -llama-cpp-python==0.1.73; platform_system != "Windows" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.73/llama_cpp_python-0.1.73-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/jllllll/exllama/releases/download/0.0.7/exllama-0.0.7+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/jllllll/exllama/releases/download/0.0.7/exllama-0.0.7+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +# llama-cpp-python without GPU support +llama-cpp-python==0.1.73; platform_system != "Windows" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.73/llama_cpp_python-0.1.73-cp310-cp310-win_amd64.whl; platform_system == "Windows" +# llama-cpp-python with CUDA support +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.73+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.73+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/settings-template.yaml b/settings-template.yaml index ef9a7e7e..de2c73d3 100644 --- a/settings-template.yaml +++ b/settings-template.yaml @@ -36,5 +36,5 @@ chat_generation_attempts_max: 10 default_extensions: [] chat_default_extensions: - gallery -preset: simple-1 +preset: 'Divine Intellect' prompt: QA