Merge pull request #3697 from jllllll/llamacpp-ggml

Use separate llama-cpp-python packages for GGML support
oobabooga 2023-08-27 01:51:00 -03:00 committed by GitHub
commit d826bc5d1b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 88 additions and 21 deletions

download-model.py

@@ -57,7 +57,8 @@ class ModelDownloader:
         classifications = []
         has_pytorch = False
         has_pt = False
-        # has_gguf = False
+        has_gguf = False
+        has_ggml = False
         has_safetensors = False
         is_lora = False
         while True:
@@ -79,6 +80,7 @@ class ModelDownloader:
             is_safetensors = re.match(r".*\.safetensors", fname)
             is_pt = re.match(r".*\.pt", fname)
             is_gguf = re.match(r'.*\.gguf', fname)
+            is_ggml = re.match(r".*ggml.*\.bin", fname)
             is_tokenizer = re.match(r"(tokenizer|ice|spiece).*\.model", fname)
             is_text = re.match(r".*\.(txt|json|py|md)", fname) or is_tokenizer
             if any((is_pytorch, is_safetensors, is_pt, is_gguf, is_tokenizer, is_text)):
@@ -102,8 +104,11 @@ class ModelDownloader:
                     has_pt = True
                     classifications.append('pt')
                 elif is_gguf:
-                    # has_gguf = True
+                    has_gguf = True
                     classifications.append('gguf')
+                elif is_ggml:
+                    has_ggml = True
+                    classifications.append('ggml')

         cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50'
         cursor = base64.b64encode(cursor)
@@ -115,6 +120,12 @@ class ModelDownloader:
                 if classifications[i] in ['pytorch', 'pt']:
                     links.pop(i)

+        # If both GGML and GGUF are available, download GGUF only
+        if has_ggml and has_gguf:
+            for i in range(len(classifications) - 1, -1, -1):
+                if classifications[i] == 'ggml':
+                    links.pop(i)
+
         return links, sha256, is_lora

     def get_output_folder(self, model, branch, is_lora, base_folder=None):
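
Taken together, the download-model.py changes classify `*ggml*.bin` files as 'ggml' and then drop those links whenever a GGUF file is also present, so only the newer format gets downloaded. Below is a minimal, self-contained sketch of that preference logic; the `filter_ggml_when_gguf` helper and the example filenames are illustrative, not code from the repository.

```python
import re

# Hypothetical helper mirroring the GGUF-over-GGML preference added above.
def filter_ggml_when_gguf(links):
    classifications = []
    for fname in links:
        if re.match(r'.*\.gguf', fname):
            classifications.append('gguf')
        elif re.match(r".*ggml.*\.bin", fname):
            classifications.append('ggml')
        else:
            classifications.append('other')

    # If both GGML and GGUF are available, keep GGUF only
    if 'gguf' in classifications and 'ggml' in classifications:
        links = [f for f, c in zip(links, classifications) if c != 'ggml']

    return links


print(filter_ggml_when_gguf([
    'llama-2-7b.Q4_K_M.gguf',
    'llama-2-7b.ggmlv3.q4_K_M.bin',
    'config.json',
]))
# ['llama-2-7b.Q4_K_M.gguf', 'config.json']
```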

modules/llamacpp_hf.py

@@ -9,23 +9,38 @@ from transformers.modeling_outputs import CausalLMOutputWithPast
 from modules import RoPE, shared
 from modules.logging_colors import logger
+from modules.utils import is_gguf

 import llama_cpp

+try:
+    import llama_cpp_ggml
+except:
+    llama_cpp_ggml = llama_cpp
+
 if torch.cuda.is_available() and not torch.version.hip:
     try:
         import llama_cpp_cuda
     except:
         llama_cpp_cuda = None
+    try:
+        import llama_cpp_ggml_cuda
+    except:
+        llama_cpp_ggml_cuda = llama_cpp_cuda
 else:
     llama_cpp_cuda = None
+    llama_cpp_ggml_cuda = None


-def llama_cpp_lib():
-    if shared.args.cpu or llama_cpp_cuda is None:
-        return llama_cpp
-    else:
-        return llama_cpp_cuda
+def llama_cpp_lib(model_file: Union[str, Path] = None):
+    if model_file is not None:
+        gguf_model = is_gguf(model_file)
+    else:
+        gguf_model = True
+
+    if shared.args.cpu or llama_cpp_cuda is None:
+        return llama_cpp if gguf_model else llama_cpp_ggml
+    else:
+        return llama_cpp_cuda if gguf_model else llama_cpp_ggml_cuda


 class LlamacppHF(PreTrainedModel):
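
The reworked `llama_cpp_lib()` now resolves to one of four modules: `llama_cpp` (GGUF, CPU), `llama_cpp_cuda` (GGUF, CUDA), or the separate `llama_cpp_ggml` / `llama_cpp_ggml_cuda` packages for legacy GGML files; if a GGML wheel is not installed, the try/except blocks alias it to the corresponding GGUF module. The sketch below is only an illustrative truth table of that selection, where `cuda_available` stands in for "not (shared.args.cpu or llama_cpp_cuda is None)" from the diff.

```python
# Illustrative sketch: which module llama_cpp_lib() picks for each combination.
# Module names come from the imports above; the helper itself is an example.
def pick_backend(gguf_model: bool, cuda_available: bool) -> str:
    if not cuda_available:
        return 'llama_cpp' if gguf_model else 'llama_cpp_ggml'
    return 'llama_cpp_cuda' if gguf_model else 'llama_cpp_ggml_cuda'


for gguf in (True, False):
    for cuda in (True, False):
        print(f'gguf={gguf}, cuda={cuda} -> {pick_backend(gguf, cuda)}')
```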
@@ -165,7 +180,7 @@ class LlamacppHF(PreTrainedModel):
         if path.is_file():
             model_file = path
         else:
-            model_file = list(path.glob('*.gguf*'))[0]
+            model_file = (list(path.glob('*.gguf*')) + list(path.glob('*ggml*.bin')))[0]

         logger.info(f"llama.cpp weights detected: {model_file}\n")
@@ -188,12 +203,17 @@ class LlamacppHF(PreTrainedModel):
             'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
             'tensor_split': tensor_split_list,
             'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
-            'n_gqa': shared.args.n_gqa or None,
-            'rms_norm_eps': shared.args.rms_norm_eps or None,
             'logits_all': True,
         }

-        Llama = llama_cpp_lib().Llama
+        if not is_gguf(model_file):
+            ggml_params = {
+                'n_gqa': shared.args.n_gqa or None,
+                'rms_norm_eps': shared.args.rms_norm_eps or None,
+            }
+            params = params | ggml_params
+
+        Llama = llama_cpp_lib(model_file).Llama
         model = Llama(**params)

         return LlamacppHF(model)
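
With this change, `n_gqa` and `rms_norm_eps` are only passed for GGML models, merged in via `params = params | ggml_params`. The `|` operator on dicts is PEP 584 dictionary union and requires Python 3.9+, with right-hand keys winning on conflicts. A minimal illustration with placeholder values:

```python
# Placeholder values; only the merge semantics mirror the diff.
params = {'n_ctx': 4096, 'logits_all': True}
ggml_params = {'n_gqa': 8, 'rms_norm_eps': 1e-5}

params = params | ggml_params  # dict union, Python 3.9+
print(params)
# {'n_ctx': 4096, 'logits_all': True, 'n_gqa': 8, 'rms_norm_eps': 1e-05}

# Equivalent on older Python versions:
# params = {**params, **ggml_params}
```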

modules/llamacpp_model.py

@@ -1,5 +1,7 @@
 import re
 from functools import partial
+from pathlib import Path
+from typing import Union

 import torch
@@ -7,23 +9,38 @@ from modules import RoPE, shared
 from modules.callbacks import Iteratorize
 from modules.logging_colors import logger
 from modules.text_generation import get_max_prompt_length
+from modules.utils import is_gguf

 import llama_cpp

+try:
+    import llama_cpp_ggml
+except:
+    llama_cpp_ggml = llama_cpp
+
 if torch.cuda.is_available() and not torch.version.hip:
     try:
         import llama_cpp_cuda
     except:
         llama_cpp_cuda = None
+    try:
+        import llama_cpp_ggml_cuda
+    except:
+        llama_cpp_ggml_cuda = llama_cpp_cuda
 else:
     llama_cpp_cuda = None
+    llama_cpp_ggml_cuda = None


-def llama_cpp_lib():
-    if shared.args.cpu or llama_cpp_cuda is None:
-        return llama_cpp
-    else:
-        return llama_cpp_cuda
+def llama_cpp_lib(model_file: Union[str, Path] = None):
+    if model_file is not None:
+        gguf_model = is_gguf(model_file)
+    else:
+        gguf_model = True
+
+    if shared.args.cpu or llama_cpp_cuda is None:
+        return llama_cpp if gguf_model else llama_cpp_ggml
+    else:
+        return llama_cpp_cuda if gguf_model else llama_cpp_ggml_cuda


 def ban_eos_logits_processor(eos_token, input_ids, logits):
@@ -41,8 +58,8 @@ class LlamaCppModel:
     @classmethod
     def from_pretrained(self, path):
-        Llama = llama_cpp_lib().Llama
-        LlamaCache = llama_cpp_lib().LlamaCache
+        Llama = llama_cpp_lib(str(path)).Llama
+        LlamaCache = llama_cpp_lib(str(path)).LlamaCache

         result = self()
         cache_capacity = 0
@@ -75,9 +92,14 @@ class LlamaCppModel:
             'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
             'tensor_split': tensor_split_list,
             'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
-            'n_gqa': shared.args.n_gqa or None,
-            'rms_norm_eps': shared.args.rms_norm_eps or None,
         }

+        if not is_gguf(str(path)):
+            ggml_params = {
+                'n_gqa': shared.args.n_gqa or None,
+                'rms_norm_eps': shared.args.rms_norm_eps or None,
+            }
+            params = params | ggml_params
+
         result.model = Llama(**params)

         if cache_capacity > 0:

modules/models.py

@@ -241,7 +241,7 @@ def llamacpp_loader(model_name):
     if path.is_file():
         model_file = path
     else:
-        model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf*'))[0]
+        model_file = (list(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf*')) + list(Path(f'{shared.args.model_dir}/{model_name}').glob('*ggml*.bin')))[0]

     logger.info(f"llama.cpp weights detected: {model_file}")
     model, tokenizer = LlamaCppModel.from_pretrained(model_file)
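
The loader now globs for both `*.gguf*` and `*ggml*.bin` and takes the first hit; because the GGUF results are concatenated first, a GGUF file wins when a folder contains both. A small sketch of that lookup against a hypothetical directory (the path is an example, and the empty-folder guard is added here for safety rather than taken from the diff):

```python
from pathlib import Path

# Hypothetical model folder, not from the repository.
path = Path('models/llama-2-7b')

candidates = list(path.glob('*.gguf*')) + list(path.glob('*ggml*.bin'))
if candidates:
    model_file = candidates[0]  # GGUF results come first, so GGUF is preferred
    print(f'llama.cpp weights detected: {model_file}')
else:
    print('no llama.cpp weights found')
```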

modules/models_settings.py

@@ -24,9 +24,9 @@ def infer_loader(model_name):
         loader = None
     elif Path(f'{shared.args.model_dir}/{model_name}/quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0):
         loader = 'AutoGPTQ'
-    elif len(list(path_to_model.glob('*.gguf*'))) > 0:
+    elif len(list(path_to_model.glob('*.gguf*')) + list(path_to_model.glob('*ggml*.bin'))) > 0:
         loader = 'llama.cpp'
-    elif re.match(r'.*\.gguf', model_name.lower()):
+    elif re.match(r'.*\.gguf|.*ggml.*\.bin', model_name.lower()):
         loader = 'llama.cpp'
     elif re.match(r'.*rwkv.*\.pth', model_name.lower()):
         loader = 'RWKV'

modules/utils.py

@@ -2,6 +2,7 @@ import os
 import re
 from datetime import datetime
 from pathlib import Path
+from typing import Union

 from modules import shared
 from modules.logging_colors import logger
@@ -124,3 +125,11 @@ def get_datasets(path: str, ext: str):

 def get_available_chat_styles():
     return sorted(set(('-'.join(k.stem.split('-')[1:]) for k in Path('css').glob('chat_style*.css'))), key=natural_keys)
+
+
+# Determines if a llama.cpp model is in GGUF format
+# Copied from ctransformers utils.py
+def is_gguf(path: Union[str, Path]) -> bool:
+    path = str(Path(path).resolve())
+    with open(path, "rb") as f:
+        magic = f.read(4)
+    return magic == "GGUF".encode()
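
`is_gguf()` distinguishes the two formats by the file's magic number: a GGUF file begins with the four ASCII bytes `GGUF`, while older GGML containers start with a different header. A self-contained usage sketch that exercises the same check against two throwaway stand-in files (the file contents and the `'ggjt'` header choice are illustrative assumptions, not real model data):

```python
import tempfile
from pathlib import Path
from typing import Union


def is_gguf(path: Union[str, Path]) -> bool:
    """Same magic-number check as the helper added above."""
    path = str(Path(path).resolve())
    with open(path, "rb") as f:
        magic = f.read(4)
    return magic == "GGUF".encode()


with tempfile.TemporaryDirectory() as d:
    gguf_file = Path(d) / 'model.gguf'
    gguf_file.write_bytes(b'GGUF' + b'\x00' * 16)   # GGUF files start with b'GGUF'

    ggml_file = Path(d) / 'model.ggmlv3.q4_0.bin'
    ggml_file.write_bytes(b'ggjt' + b'\x00' * 16)   # a non-GGUF header, e.g. the old 'ggjt' magic

    print(is_gguf(gguf_file))  # True
    print(is_gguf(ggml_file))  # False
```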

requirements.txt

@@ -35,6 +35,11 @@ https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.79/llama_cpp_
 # llama-cpp-python with CUDA support
 https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.79+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.79+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+# llama-cpp-python with GGML support
+https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python_ggml-0.1.78+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows"
+https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python_ggml-0.1.78+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_ggml_cuda-0.1.78+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
+https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_ggml_cuda-0.1.78+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

 # GPTQ-for-LLaMa
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
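
Note that the new `llama_cpp_python_ggml` / `llama_cpp_python_ggml_cuda` wheels stay pinned at 0.1.78 while the standard GGUF wheels in this file are at 0.1.79, and pip picks exactly one wheel per requirement using the environment marker after each `;`. A rough sketch of how such a marker evaluates, using the third-party `packaging` library (an assumption for illustration, not a dependency this commit adds):

```python
# Sketch only: evaluating pip-style environment markers like the ones above.
from packaging.markers import Marker

markers = [
    'platform_system == "Windows"',
    'platform_system == "Linux" and platform_machine == "x86_64"',
]

for m in markers:
    # Marker.evaluate() checks the expression against the current interpreter
    # and platform, which is why only one of each wheel pair is installed.
    print(m, '->', Marker(m).evaluate())
```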