mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-22 16:17:57 +01:00
Add AutoGPTQ support (basic) (#2132)
This commit is contained in:
parent
10cf7831f7
commit
1a8151a2b6
41
modules/AutoGPTQ_loader.py
Normal file
41
modules/AutoGPTQ_loader.py
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from auto_gptq import AutoGPTQForCausalLM
|
||||||
|
|
||||||
|
import modules.shared as shared
|
||||||
|
from modules.models import get_max_memory_dict
|
||||||
|
|
||||||
|
|
||||||
|
def load_quantized(model_name):
    """Load a quantized model through AutoGPTQ.

    The model directory is ``{shared.args.model_dir}/{model_name}``. It is
    searched (non-recursively) for a checkpoint file; ``*.safetensors`` files
    take precedence over ``*.pt`` files. When several candidates of the chosen
    kind exist, a warning is logged and the last one returned by ``glob`` is
    used.

    Args:
        model_name: Name of the model sub-directory inside
            ``shared.args.model_dir``.

    Returns:
        Whatever ``AutoGPTQForCausalLM.from_quantized`` returns for the
        selected checkpoint.

    Raises:
        FileNotFoundError: If the model directory contains neither a
            ``.safetensors`` nor a ``.pt`` checkpoint.
    """
    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
    pt_path = None
    use_safetensors = False

    # Find the model checkpoint
    found_pts = list(path_to_model.glob("*.pt"))
    found_safetensors = list(path_to_model.glob("*.safetensors"))
    if found_safetensors:
        # The ambiguity check must look at the safetensors candidates, since
        # those are the files being chosen from in this branch.
        if len(found_safetensors) > 1:
            logging.warning('More than one .safetensors model has been found. The last one will be selected. It could be wrong.')

        use_safetensors = True
        pt_path = found_safetensors[-1]
    elif found_pts:
        if len(found_pts) > 1:
            logging.warning('More than one .pt model has been found. The last one will be selected. It could be wrong.')

        pt_path = found_pts[-1]

    # Fail with an explicit error instead of an AttributeError on `None.stem`.
    if pt_path is None:
        raise FileNotFoundError(
            f"No .safetensors or .pt checkpoint could be found in {path_to_model}"
        )

    # Define the params for AutoGPTQForCausalLM.from_quantized
    params = {
        # File name without its extension.
        'model_basename': pt_path.stem,
        'device': "cuda:0" if not shared.args.cpu else "cpu",
        'use_triton': shared.args.triton,
        'use_safetensors': use_safetensors,
        'max_memory': get_max_memory_dict(),
    }

    logging.warning(f"The AutoGPTQ params are: {params}")
    model = AutoGPTQForCausalLM.from_quantized(path_to_model, **params)
    return model
|
@ -72,6 +72,9 @@ def load_model(model_name):
|
|||||||
|
|
||||||
shared.model_type = find_model_type(model_name)
|
shared.model_type = find_model_type(model_name)
|
||||||
if shared.args.wbits > 0:
|
if shared.args.wbits > 0:
|
||||||
|
if shared.args.autogptq:
|
||||||
|
load_func = AutoGPTQ_loader
|
||||||
|
else:
|
||||||
load_func = GPTQ_loader
|
load_func = GPTQ_loader
|
||||||
elif shared.model_type == 'llamacpp':
|
elif shared.model_type == 'llamacpp':
|
||||||
load_func = llamacpp_loader
|
load_func = llamacpp_loader
|
||||||
@ -261,6 +264,12 @@ def GPTQ_loader(model_name):
|
|||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
def AutoGPTQ_loader(model_name):
    """Load ``model_name`` via ``modules.AutoGPTQ_loader.load_quantized``.

    The import is performed inside the function, so the AutoGPTQ loader
    module is only imported when this loader is actually called.
    """
    from modules.AutoGPTQ_loader import load_quantized as _load_with_autogptq

    model = _load_with_autogptq(model_name)
    return model
|
||||||
|
|
||||||
|
|
||||||
def get_max_memory_dict():
|
def get_max_memory_dict():
|
||||||
max_memory = {}
|
max_memory = {}
|
||||||
if shared.args.gpu_memory:
|
if shared.args.gpu_memory:
|
||||||
@ -283,7 +292,7 @@ def get_max_memory_dict():
|
|||||||
logging.warning(f"Auto-assiging --gpu-memory {suggestion} for your GPU to try to prevent out-of-memory errors. You can manually set other values.")
|
logging.warning(f"Auto-assiging --gpu-memory {suggestion} for your GPU to try to prevent out-of-memory errors. You can manually set other values.")
|
||||||
max_memory = {0: f'{suggestion}GiB', 'cpu': f'{shared.args.cpu_memory or 99}GiB'}
|
max_memory = {0: f'{suggestion}GiB', 'cpu': f'{shared.args.cpu_memory or 99}GiB'}
|
||||||
|
|
||||||
return max_memory
|
return max_memory if len(max_memory) > 0 else None
|
||||||
|
|
||||||
|
|
||||||
def clear_torch_cache():
|
def clear_torch_cache():
|
||||||
|
@ -137,6 +137,10 @@ parser.add_argument('--quant_attn', action='store_true', help='(triton) Enable q
|
|||||||
parser.add_argument('--warmup_autotune', action='store_true', help='(triton) Enable warmup autotune.')
|
parser.add_argument('--warmup_autotune', action='store_true', help='(triton) Enable warmup autotune.')
|
||||||
parser.add_argument('--fused_mlp', action='store_true', help='(triton) Enable fused mlp.')
|
parser.add_argument('--fused_mlp', action='store_true', help='(triton) Enable fused mlp.')
|
||||||
|
|
||||||
|
# AutoGPTQ
|
||||||
|
parser.add_argument('--autogptq', action='store_true', help='Use AutoGPTQ for loading quantized models instead of the internal GPTQ loader.')
|
||||||
|
parser.add_argument('--triton', action='store_true', help='Use triton.')
|
||||||
|
|
||||||
# FlexGen
|
# FlexGen
|
||||||
parser.add_argument('--flexgen', action='store_true', help='Enable the use of FlexGen offloading.')
|
parser.add_argument('--flexgen', action='store_true', help='Enable the use of FlexGen offloading.')
|
||||||
parser.add_argument('--percent', type=int, nargs="+", default=[0, 100, 100, 0, 100, 0], help='FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0).')
|
parser.add_argument('--percent', type=int, nargs="+", default=[0, 100, 100, 0, 100, 0], help='FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0).')
|
||||||
|
Loading…
Reference in New Issue
Block a user