# text-generation-webui/modules/AutoGPTQ_loader.py

import logging
from pathlib import Path

from auto_gptq import AutoGPTQForCausalLM

import modules.shared as shared
from modules.models import get_max_memory_dict


def _find_checkpoint(path_to_model):
    """Locate the quantized checkpoint file inside *path_to_model*.

    ``.safetensors`` files take precedence over ``.pt`` files. When several
    files share the winning extension, a warning is logged and the last one
    in sorted path order is chosen.

    Args:
        path_to_model: ``pathlib.Path`` of the directory to search
            (non-recursive).

    Returns:
        A ``(checkpoint_path, use_safetensors)`` tuple, or ``(None, False)``
        when the directory contains neither kind of file.
    """
    for extension, is_safetensors in (('.safetensors', True), ('.pt', False)):
        # glob() yields entries in filesystem order; sorting makes "the last
        # one" the same file on every run and platform.
        candidates = sorted(path_to_model.glob(f'*{extension}'))
        if not candidates:
            continue

        if len(candidates) > 1:
            logging.warning(f'More than one {extension} model has been found. The last one will be selected. It could be wrong.')

        return candidates[-1], is_safetensors

    return None, False


def load_quantized(model_name):
    """Load a quantized model through ``AutoGPTQForCausalLM.from_quantized``.

    The model directory is ``<shared.args.model_dir>/<model_name>``. The
    checkpoint inside it is discovered by extension, preferring
    ``.safetensors`` over ``.pt``.

    Args:
        model_name: Name of the model sub-directory inside
            ``shared.args.model_dir``.

    Returns:
        The object returned by ``AutoGPTQForCausalLM.from_quantized``.

    Raises:
        FileNotFoundError: If the model directory contains no ``.safetensors``
            or ``.pt`` checkpoint.
    """
    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')

    # Find the model checkpoint
    pt_path, use_safetensors = _find_checkpoint(path_to_model)
    if pt_path is None:
        # Fail with an actionable message instead of an AttributeError on
        # ``None.stem`` further down.
        raise FileNotFoundError(
            f'No .safetensors or .pt checkpoint file was found in {path_to_model}'
        )

    # Define the params for AutoGPTQForCausalLM.from_quantized
    params = {
        # The basename is the checkpoint file name without its extension.
        'model_basename': pt_path.stem,
        'device': "cpu" if shared.args.cpu else "cuda:0",
        'use_triton': shared.args.triton,
        'use_safetensors': use_safetensors,
        'max_memory': get_max_memory_dict()
    }

    logging.warning(f"The AutoGPTQ params are: {params}")
    model = AutoGPTQForCausalLM.from_quantized(path_to_model, **params)
    return model