mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-22 16:17:57 +01:00
Add 4-bit LoRA support (#1200)
This commit is contained in:
parent
ec3e869c27
commit
39099663a0
@ -237,6 +237,7 @@ Optionally, you can use the following command-line flags:
|
|||||||
| `--groupsize GROUPSIZE` | GPTQ: Group size. |
|
| `--groupsize GROUPSIZE` | GPTQ: Group size. |
|
||||||
| `--pre_layer PRE_LAYER` | GPTQ: The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. |
|
| `--pre_layer PRE_LAYER` | GPTQ: The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. |
|
||||||
| `--no-warmup_autotune` | GPTQ: Disable warmup autotune for triton. |
|
| `--no-warmup_autotune` | GPTQ: Disable warmup autotune for triton. |
|
||||||
|
| `--monkey-patch` | GPTQ: Apply the monkey patch for using LoRAs with quantized models. |
|
||||||
|
|
||||||
#### FlexGen
|
#### FlexGen
|
||||||
|
|
||||||
|
@ -16,6 +16,8 @@ from modelutils import find_layers
|
|||||||
from quant import make_quant
|
from quant import make_quant
|
||||||
|
|
||||||
|
|
||||||
|
# This function is a replacement for the load_quant function in the
|
||||||
|
# GPTQ-for-LLaMa repository. It supports more models and branches.
|
||||||
def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128):
|
def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128):
|
||||||
|
|
||||||
def noop(*args, **kwargs):
|
def noop(*args, **kwargs):
|
||||||
@ -64,6 +66,7 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
from quant import autotune_warmup, make_quant_attn
|
from quant import autotune_warmup, make_quant_attn
|
||||||
|
|
||||||
# triton branch
|
# triton branch
|
||||||
make_quant_attn(model)
|
make_quant_attn(model)
|
||||||
if not shared.args.no_warmup_autotune:
|
if not shared.args.no_warmup_autotune:
|
||||||
@ -77,6 +80,41 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc
|
|||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
# Used to locate the .pt/.safetensors quantized file
|
||||||
|
def find_quantized_model_file(model_name):
    """Locate the quantized weights file (.pt/.safetensors) for a model.

    Well-behaved file names derived from ``shared.args.wbits`` and
    ``shared.args.groupsize`` are tried first, in priority order
    (``.safetensors`` before ``.pt``). If none of them exists, the model's
    own folder is globbed as a last resort: the last ``.pt`` file found is
    used, or, when there is no ``.pt`` file, the last ``.safetensors`` file.

    Args:
        model_name: Name of the model, relative to ``shared.args.model_dir``.

    Returns:
        A ``Path`` to the weights file, or ``None`` if nothing was found.
    """
    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')

    # Candidate names, most specific first: with the group size suffix (when
    # a group size is set), then without it; next to the model folder, inside
    # it prefixed by the model name, or inside it with no prefix.
    priority_name_list = [
        Path(f'{shared.args.model_dir}/{model_name}{hyphen}{shared.args.wbits}bit{group}{ext}')
        for group in ([f'-{shared.args.groupsize}g', ''] if shared.args.groupsize > 0 else [''])
        for ext in ['.safetensors', '.pt']
        for hyphen in ['-', f'/{model_name}-', '/']
    ]
    for path in priority_name_list:
        if path.exists():
            return path

    # If the model hasn't been found with a well-behaved name, pick the last .pt
    # or the last .safetensors found in its folder as a last resort.
    # "Last" follows the order in which glob() yields the entries.
    found_pts = list(path_to_model.glob("*.pt"))
    found_safetensors = list(path_to_model.glob("*.safetensors"))

    if found_pts:
        if len(found_pts) > 1:
            print('Warning: more than one .pt model has been found. The last one will be selected. It could be wrong.')
        return found_pts[-1]

    if found_safetensors:
        # The original checked len(found_pts) here, which is always 0 in this
        # branch, so the warning was never printed.
        if len(found_safetensors) > 1:
            print('Warning: more than one .safetensors model has been found. The last one will be selected. It could be wrong.')
        return found_safetensors[-1]

    return None
|
||||||
|
|
||||||
|
|
||||||
|
# The function that loads the model in modules/models.py
|
||||||
def load_quantized(model_name):
|
def load_quantized(model_name):
|
||||||
|
|
||||||
# Find the model type
|
# Find the model type
|
||||||
@ -106,37 +144,9 @@ def load_quantized(model_name):
|
|||||||
print("Unknown pre-quantized model type specified. Only 'llama', 'opt' and 'gptj' are supported")
|
print("Unknown pre-quantized model type specified. Only 'llama', 'opt' and 'gptj' are supported")
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
# Locate the quantized model file
|
# Find the quantized model weights file (.pt/.safetensors)
|
||||||
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
|
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
|
||||||
pt_path = None
|
pt_path = find_quantized_model_file(model_name)
|
||||||
priority_name_list = [
|
|
||||||
Path(f'{shared.args.model_dir}/{model_name}{hyphen}{shared.args.wbits}bit{group}{ext}')
|
|
||||||
for group in ([f'-{shared.args.groupsize}g', ''] if shared.args.groupsize > 0 else [''])
|
|
||||||
for ext in ['.safetensors', '.pt']
|
|
||||||
for hyphen in ['-', f'/{model_name}-', '/']
|
|
||||||
]
|
|
||||||
for path in priority_name_list:
|
|
||||||
if path.exists():
|
|
||||||
pt_path = path
|
|
||||||
break
|
|
||||||
|
|
||||||
# If the model hasn't been found with a well-behaved name, pick the last .pt
|
|
||||||
# or the last .safetensors found in its folder as a last resort
|
|
||||||
if not pt_path:
|
|
||||||
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
|
|
||||||
found_pts = list(path_to_model.glob("*.pt"))
|
|
||||||
found_safetensors = list(path_to_model.glob("*.safetensors"))
|
|
||||||
pt_path = None
|
|
||||||
|
|
||||||
if len(found_pts) > 0:
|
|
||||||
if len(found_pts) > 1:
|
|
||||||
print('Warning: more than one .pt model has been found. The last one will be selected. It could be wrong.')
|
|
||||||
pt_path = found_pts[-1]
|
|
||||||
elif len(found_safetensors) > 0:
|
|
||||||
if len(found_pts) > 1:
|
|
||||||
print('Warning: more than one .safetensors model has been found. The last one will be selected. It could be wrong.')
|
|
||||||
pt_path = found_safetensors[-1]
|
|
||||||
|
|
||||||
if not pt_path:
|
if not pt_path:
|
||||||
print("Could not find the quantized model in .pt or .safetensors format, exiting...")
|
print("Could not find the quantized model in .pt or .safetensors format, exiting...")
|
||||||
exit()
|
exit()
|
||||||
|
@ -43,7 +43,8 @@ def add_lora_to_model(lora_names):
|
|||||||
shared.model.load_adapter(Path(f"{shared.args.lora_dir}/{lora}"), lora)
|
shared.model.load_adapter(Path(f"{shared.args.lora_dir}/{lora}"), lora)
|
||||||
|
|
||||||
if not shared.args.load_in_8bit and not shared.args.cpu:
|
if not shared.args.load_in_8bit and not shared.args.cpu:
|
||||||
shared.model.half()
|
if not shared.args.monkey_patch:
|
||||||
|
shared.model.half()
|
||||||
if not hasattr(shared.model, "hf_device_map"):
|
if not hasattr(shared.model, "hf_device_map"):
|
||||||
if torch.has_mps:
|
if torch.has_mps:
|
||||||
device = torch.device('mps')
|
device = torch.device('mps')
|
||||||
|
@ -101,9 +101,20 @@ def load_model(model_name):
|
|||||||
|
|
||||||
# Quantized model
|
# Quantized model
|
||||||
elif shared.args.wbits > 0:
|
elif shared.args.wbits > 0:
|
||||||
from modules.GPTQ_loader import load_quantized
|
|
||||||
|
|
||||||
model = load_quantized(model_name)
|
# Monkey patch
|
||||||
|
if shared.args.monkey_patch:
|
||||||
|
print("Warning: applying the monkey patch for using LoRAs in 4-bit mode.\nIt may cause undefined behavior outside its intended scope.")
|
||||||
|
from modules.monkey_patch_gptq_lora import load_model_llama
|
||||||
|
|
||||||
|
model, tokenizer = load_model_llama(model_name)
|
||||||
|
return model, tokenizer
|
||||||
|
|
||||||
|
# No monkey patch
|
||||||
|
else:
|
||||||
|
from modules.GPTQ_loader import load_quantized
|
||||||
|
|
||||||
|
model = load_quantized(model_name)
|
||||||
|
|
||||||
# llamacpp model
|
# llamacpp model
|
||||||
elif shared.is_llamacpp:
|
elif shared.is_llamacpp:
|
||||||
|
41
modules/monkey_patch_gptq_lora.py
Normal file
41
modules/monkey_patch_gptq_lora.py
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
# Copied from https://github.com/johnsmith0031/alpaca_lora_4bit
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path("repositories/alpaca_lora_4bit")))
|
||||||
|
|
||||||
|
import autograd_4bit
|
||||||
|
from autograd_4bit import (Autograd4bitQuantLinear,
|
||||||
|
load_llama_model_4bit_low_ram)
|
||||||
|
from monkeypatch.peft_tuners_lora_monkey_patch import (
|
||||||
|
Linear4bitLt, replace_peft_model_with_gptq_lora_model)
|
||||||
|
|
||||||
|
from modules import shared
|
||||||
|
from modules.GPTQ_loader import find_quantized_model_file
|
||||||
|
|
||||||
|
replace_peft_model_with_gptq_lora_model()
|
||||||
|
|
||||||
|
def load_model_llama(model_name):
    """Load a 4-bit LLaMA model via alpaca_lora_4bit so LoRAs can be applied.

    The model configuration is read from ``shared.args.model_dir/model_name``
    and the quantized weights file is located with
    ``find_quantized_model_file``. After loading, the quantization tensors of
    every 4-bit linear layer are cast to half precision and the
    ``autograd_4bit`` module flags ``use_new`` and ``auto_switch`` are enabled.

    Args:
        model_name: Name of the model, relative to ``shared.args.model_dir``.

    Returns:
        A ``(model, tokenizer)`` tuple as returned by
        ``load_llama_model_4bit_low_ram``.

    Raises:
        SystemExit: If no quantized weights file can be found.
    """
    config_path = str(Path(f'{shared.args.model_dir}/{model_name}'))

    # find_quantized_model_file returns None when nothing matches; str(None)
    # would hand the loader the literal path "None", so stop here instead.
    weights_file = find_quantized_model_file(model_name)
    if weights_file is None:
        print("Could not find the quantized model in .pt or .safetensors format, exiting...")
        sys.exit(1)

    model_path = str(weights_file)
    model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=shared.args.groupsize, is_v1_model=False)

    # NOTE(review): nesting reconstructed from flattened text - zeros is cast
    # only for v1 models, scales and bias for every 4-bit layer; confirm
    # against upstream alpaca_lora_4bit.
    for _, m in model.named_modules():
        if isinstance(m, (Autograd4bitQuantLinear, Linear4bitLt)):
            if m.is_v1_model:
                m.zeros = m.zeros.half()
            m.scales = m.scales.half()
            m.bias = m.bias.half()

    autograd_4bit.use_new = True
    autograd_4bit.auto_switch = True

    # Best effort: force the special token ids. A tokenizer that rejects the
    # assignment is left untouched, but KeyboardInterrupt/SystemExit are no
    # longer swallowed as they were by the original bare ``except``.
    try:
        tokenizer.eos_token_id = 2
        tokenizer.bos_token_id = 1
        tokenizer.pad_token_id = 0
    except Exception:
        pass

    return model, tokenizer
|
@ -124,6 +124,7 @@ parser.add_argument('--model_type', type=str, help='GPTQ: Model type of pre-quan
|
|||||||
parser.add_argument('--groupsize', type=int, default=-1, help='GPTQ: Group size.')
|
parser.add_argument('--groupsize', type=int, default=-1, help='GPTQ: Group size.')
|
||||||
parser.add_argument('--pre_layer', type=int, default=0, help='GPTQ: The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models.')
|
parser.add_argument('--pre_layer', type=int, default=0, help='GPTQ: The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models.')
|
||||||
parser.add_argument('--no-warmup_autotune', action='store_true', help='GPTQ: Disable warmup autotune for triton.')
|
parser.add_argument('--no-warmup_autotune', action='store_true', help='GPTQ: Disable warmup autotune for triton.')
|
||||||
|
parser.add_argument('--monkey-patch', action='store_true', help='GPTQ: Apply the monkey patch for using LoRAs with quantized models.')
|
||||||
|
|
||||||
# FlexGen
|
# FlexGen
|
||||||
parser.add_argument('--flexgen', action='store_true', help='Enable the use of FlexGen offloading.')
|
parser.add_argument('--flexgen', action='store_true', help='Enable the use of FlexGen offloading.')
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
accelerate==0.18.0
|
accelerate==0.18.0
|
||||||
|
colorama
|
||||||
datasets
|
datasets
|
||||||
flexgen==0.1.7
|
flexgen==0.1.7
|
||||||
gradio==3.25
|
gradio==3.25.0
|
||||||
markdown
|
markdown
|
||||||
numpy
|
numpy
|
||||||
Pillow>=9.5.0
|
Pillow>=9.5.0
|
||||||
|
Loading…
Reference in New Issue
Block a user