text-generation-webui/modules/GPTQ_loader.py

import inspect
import re
import sys
from pathlib import Path

import accelerate
import torch
import transformers
from transformers import AutoConfig, AutoModelForCausalLM

import modules.shared as shared

sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
import llama_inference_offload
from modelutils import find_layers
from quant import make_quant


# This function is a replacement for the load_quant function in the
# GPTQ-for-LLaMa repository. It supports more models and branches.
def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128):
    """Build a quantized model skeleton and load GPTQ weights into it.

    The model is instantiated from its config only (no pretrained weights),
    its layers are swapped for quantized ones through ``make_quant``, and the
    checkpoint is then loaded with ``strict=False``.

    Side effects (process-wide, not reverted): ``torch.nn.init.kaiming_uniform_``,
    ``torch.nn.init.uniform_`` and ``torch.nn.init.normal_`` are replaced by a
    no-op, and ``transformers.modeling_utils._init_weights`` is set to False.

    Args:
        model: Value handed to ``AutoConfig.from_pretrained`` (the caller in
            this file passes the model folder as a string).
        checkpoint: Path of the weights file as a string. A name ending in
            ``.safetensors`` is read with safetensors, anything else with
            ``torch.load``.
        wbits: Passed to ``make_quant`` as ``bits``.
        groupsize: Forwarded to ``make_quant`` only if it accepts ``groupsize``.
        faster_kernel: Forwarded to ``make_quant`` as ``faster`` only if it
            accepts that argument.
        exclude_layers: Layer names removed from the set of layers to
            quantize. The default list is only iterated, never mutated.
        kernel_switch_threshold: Forwarded to ``make_quant`` only if it
            accepts ``kernel_switch_threshold``.

    Returns:
        The model in eval mode, with ``seqlen`` set to 2048.
    """

    def noop(*args, **kwargs):
        pass

    config = AutoConfig.from_pretrained(model)

    # Disable weight initialization: the checkpoint supplies the weights.
    torch.nn.init.kaiming_uniform_ = noop
    torch.nn.init.uniform_ = noop
    torch.nn.init.normal_ = noop
    transformers.modeling_utils._init_weights = False

    # Create the parameters in half precision, and always put the default
    # dtype back to float, even when model construction raises.
    torch.set_default_dtype(torch.half)
    try:
        model = AutoModelForCausalLM.from_config(config)
    finally:
        torch.set_default_dtype(torch.float)

    model = model.eval()
    layers = find_layers(model)
    for name in exclude_layers:
        if name in layers:
            del layers[name]

    # The make_quant signature differs between GPTQ-for-LLaMa branches, so
    # only pass the optional arguments that the installed one declares.
    gptq_args = inspect.getfullargspec(make_quant).args

    make_quant_kwargs = {
        'module': model,
        'names': layers,
        'bits': wbits,
    }
    optional_kwargs = {
        'groupsize': groupsize,
        'faster': faster_kernel,
        'kernel_switch_threshold': kernel_switch_threshold,
    }
    for key, value in optional_kwargs.items():
        if key in gptq_args:
            make_quant_kwargs[key] = value

    make_quant(**make_quant_kwargs)

    del layers

    print('Loading model ...')
    if checkpoint.endswith('.safetensors'):
        from safetensors.torch import load_file as safe_load
        model.load_state_dict(safe_load(checkpoint), strict=False)
    else:
        # NOTE(security): torch.load unpickles the file and can execute
        # arbitrary code. Only load .pt checkpoints from trusted sources.
        model.load_state_dict(torch.load(checkpoint), strict=False)

    try:
        from quant import autotune_warmup, make_quant_attn

        # triton branch
        make_quant_attn(model)
        if not shared.args.no_warmup_autotune:
            autotune_warmup(model)
    except ImportError:  # not triton branch
        pass

    model.seqlen = 2048
    print('Done.')

    return model


# Used to locate the .pt/.safetensors quantized file
def find_quantized_model_file(model_name):
    """Locate the quantized weights file (.pt or .safetensors) of a model.

    Conventional file names built from ``shared.args.wbits`` and
    ``shared.args.groupsize`` are probed first, next to and inside the
    model folder. If none exists, the model folder is scanned and the last
    ``.pt`` file is chosen, or failing that the last ``.safetensors`` file.

    Args:
        model_name: Name of the model. It is used both as a file-name prefix
            and as the sub-folder of ``shared.args.model_dir`` to search.

    Returns:
        A ``Path`` to the selected weights file, or ``None`` if no candidate
        was found.
    """
    model_dir = shared.args.model_dir
    path_to_model = Path(f'{model_dir}/{model_name}')

    # With a positive groupsize, names carrying the "-<groupsize>g" suffix
    # take precedence over names without it.
    group_suffixes = [f'-{shared.args.groupsize}g', ''] if shared.args.groupsize > 0 else ['']
    priority_name_list = [
        Path(f'{model_dir}/{model_name}{hyphen}{shared.args.wbits}bit{group}{ext}')
        for group in group_suffixes
        for ext in ['.safetensors', '.pt']
        for hyphen in ['-', f'/{model_name}-', '/']
    ]
    for path in priority_name_list:
        if path.exists():
            return path

    # If the model hasn't been found with a well-behaved name, pick the last .pt
    # or the last .safetensors found in its folder as a last resort.
    # "Last" follows the order in which glob() yields the entries.
    found_pts = list(path_to_model.glob("*.pt"))
    if found_pts:
        if len(found_pts) > 1:
            print('Warning: more than one .pt model has been found. The last one will be selected. It could be wrong.')
        return found_pts[-1]

    found_safetensors = list(path_to_model.glob("*.safetensors"))
    if found_safetensors:
        # The count must come from the .safetensors list; checking the .pt
        # list here would never trigger, as it is empty in this branch.
        if len(found_safetensors) > 1:
            print('Warning: more than one .safetensors model has been found. The last one will be selected. It could be wrong.')
        return found_safetensors[-1]

    return None


# The function that loads the model in modules/models.py
def load_quantized(model_name):
    """Load a pre-quantized GPTQ model and place it on its device(s).

    The model type comes from ``shared.args.model_type`` when set, otherwise
    it is guessed from keywords in ``model_name``. Llama models with
    ``--pre_layer`` go through ``llama_inference_offload.load_quant``; every
    other supported type goes through ``_load_quant`` and is then dispatched
    with accelerate, moved to ``cuda:0``, or left untouched when
    ``shared.args.cpu`` is set.

    The process exits (via ``exit()``) when the model type cannot be
    determined, is unsupported, or no weights file is found.

    Args:
        model_name: Name of the model folder inside ``shared.args.model_dir``.

    Returns:
        The loaded model.
    """
    args = shared.args

    # Find the model type
    if args.model_type:
        model_type = args.model_type.lower()
    else:
        lowered = model_name.lower()
        # Checked in order; the first type with a matching keyword wins.
        keyword_table = (
            ('llama', ['llama', 'alpaca', 'vicuna']),
            ('opt', ['opt-', 'galactica']),
            ('gptj', ['gpt-j', 'pygmalion-6b']),
        )
        model_type = next(
            (kind for kind, keywords in keyword_table if any(k in lowered for k in keywords)),
            None,
        )
        if model_type is None:
            print("Can't determine model type from model name. Please specify it manually using --model_type "
                  "argument")
            exit()

    # Select the appropriate load_quant function
    if model_type not in ('llama', 'opt', 'gptj'):
        print("Unknown pre-quantized model type specified. Only 'llama', 'opt' and 'gptj' are supported")
        exit()

    use_pre_layer_offload = model_type == 'llama' and bool(args.pre_layer)
    if use_pre_layer_offload:
        loader = llama_inference_offload.load_quant
    else:
        if args.pre_layer:
            print("Warning: ignoring --pre_layer because it only works for llama model type.")
        loader = _load_quant

    # Find the quantized model weights file (.pt/.safetensors)
    model_folder = Path(f'{args.model_dir}/{model_name}')
    weights_file = find_quantized_model_file(model_name)
    if not weights_file:
        print("Could not find the quantized model in .pt or .safetensors format, exiting...")
        exit()
    print(f"Found the following quantized model: {weights_file}")

    # qwopqwop200's offload
    if use_pre_layer_offload:
        return loader(str(model_folder), str(weights_file), args.wbits, args.groupsize, args.pre_layer)

    # gptj gets False as its kernel switch threshold, the other types 128.
    threshold = 128 if model_type != 'gptj' else False
    model = loader(str(model_folder), str(weights_file), args.wbits, args.groupsize, kernel_switch_threshold=threshold)

    # accelerate offload (doesn't work properly)
    if args.gpu_memory or torch.cuda.device_count() > 1:
        if args.gpu_memory:
            max_cpu_memory = '99GiB' if args.cpu_memory is None else args.cpu_memory.strip()
            max_memory = {}
            for device_index, raw_limit in enumerate(args.gpu_memory):
                limit = raw_limit.strip()
                # Values without an "...ib" unit suffix are taken as GiB.
                if re.match('.*ib$', limit.lower()):
                    max_memory[device_index] = limit
                else:
                    max_memory[device_index] = f'{limit}GiB'
            max_memory['cpu'] = max_cpu_memory
        else:
            max_memory = accelerate.utils.get_balanced_memory(model)

        device_map = accelerate.infer_auto_device_map(
            model,
            max_memory=max_memory,
            no_split_module_classes=["LlamaDecoderLayer"],
        )
        print("Using the following device map for the quantized model:", device_map)
        # https://huggingface.co/docs/accelerate/package_reference/big_modeling#accelerate.dispatch_model
        return accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True)

    # No offload
    if not args.cpu:
        model = model.to(torch.device('cuda:0'))

    return model
Broaden GPTQ-for-LLaMA branch support (#820) 2023-04-06 17:16:48 +02:00			`import inspect`
Attempt at fixing 4-bit with CPU offload 2023-03-20 19:11:56 +01:00			`import re`
Move LLaMA 4-bit into a separate file 2023-03-12 15:12:34 +01:00			`import sys`
			`from pathlib import Path`

			`import accelerate`
			`import torch`
Generalized load_quantized 2023-03-28 19:38:55 +02:00			`import transformers`
Reorder imports 2023-03-28 22:34:15 +02:00			`from transformers import AutoConfig, AutoModelForCausalLM`
Move LLaMA 4-bit into a separate file 2023-03-12 15:12:34 +01:00
			`import modules.shared as shared`

Use str(Path) instead of os.path.abspath(Path) 2023-03-13 04:08:01 +01:00			`sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))`
Add -gptq-preload for 4-bit offloading (#460) This works in a 4GB card now: ``` python server.py --model llama-7b-hf --gptq-bits 4 --gptq-pre-layer 20 ``` 2023-03-20 20:30:56 +01:00			`import llama_inference_offload`
Generalized load_quantized 2023-03-28 19:38:55 +02:00			`from modelutils import find_layers`
Reorder imports 2023-03-28 22:34:15 +02:00			`from quant import make_quant`

Generalized load_quantized 2023-03-28 19:38:55 +02:00
Add 4-bit LoRA support (#1200) 2023-04-17 04:26:52 +02:00			`# This function is a replacement for the load_quant function in the`
			`# GPTQ-for_LLaMa repository. It supports more models and branches.`
Disable kernel threshold for gpt-j 2023-03-28 21:45:38 +02:00			`def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128):`
Make the code more like PEP8 for readability (#862) 2023-04-07 05:15:45 +02:00
Generalized load_quantized 2023-03-28 19:38:55 +02:00			`def noop(*args, **kwargs):`
			`pass`
Make the code more like PEP8 for readability (#862) 2023-04-07 05:15:45 +02:00
			`config = AutoConfig.from_pretrained(model)`
Broaden GPTQ-for-LLaMA branch support (#820) 2023-04-06 17:16:48 +02:00			`torch.nn.init.kaiming_uniform_ = noop`
			`torch.nn.init.uniform_ = noop`
			`torch.nn.init.normal_ = noop`
Generalized load_quantized 2023-03-28 19:38:55 +02:00
			`torch.set_default_dtype(torch.half)`
			`transformers.modeling_utils._init_weights = False`
			`torch.set_default_dtype(torch.half)`
			`model = AutoModelForCausalLM.from_config(config)`
			`torch.set_default_dtype(torch.float)`
			`model = model.eval()`
			`layers = find_layers(model)`
			`for name in exclude_layers:`
			`if name in layers:`
			`del layers[name]`
Make the code more like PEP8 for readability (#862) 2023-04-07 05:15:45 +02:00
Broaden GPTQ-for-LLaMA branch support (#820) 2023-04-06 17:16:48 +02:00			`gptq_args = inspect.getfullargspec(make_quant).args`
Generalized load_quantized 2023-03-28 19:38:55 +02:00
Broaden GPTQ-for-LLaMA branch support (#820) 2023-04-06 17:16:48 +02:00			`make_quant_kwargs = {`
Make the code more like PEP8 for readability (#862) 2023-04-07 05:15:45 +02:00			`'module': model,`
Broaden GPTQ-for-LLaMA branch support (#820) 2023-04-06 17:16:48 +02:00			`'names': layers,`
			`'bits': wbits,`
			`}`
			`if 'groupsize' in gptq_args:`
			`make_quant_kwargs['groupsize'] = groupsize`
			`if 'faster' in gptq_args:`
			`make_quant_kwargs['faster'] = faster_kernel`
			`if 'kernel_switch_threshold' in gptq_args:`
			`make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold`
Make the code more like PEP8 for readability (#862) 2023-04-07 05:15:45 +02:00
Broaden GPTQ-for-LLaMA branch support (#820) 2023-04-06 17:16:48 +02:00			`make_quant(**make_quant_kwargs)`

			`del layers`

Generalized load_quantized 2023-03-28 19:38:55 +02:00			`print('Loading model ...')`
			`if checkpoint.endswith('.safetensors'):`
			`from safetensors.torch import load_file as safe_load`
Make the code more like PEP8 for readability (#862) 2023-04-07 05:15:45 +02:00			`model.load_state_dict(safe_load(checkpoint), strict=False)`
Generalized load_quantized 2023-03-28 19:38:55 +02:00			`else:`
Make the code more like PEP8 for readability (#862) 2023-04-07 05:15:45 +02:00			`model.load_state_dict(torch.load(checkpoint), strict=False)`
Keep minimal change. 2023-04-12 17:26:06 +02:00
			`try:`
			`from quant import autotune_warmup, make_quant_attn`
Add 4-bit LoRA support (#1200) 2023-04-17 04:26:52 +02:00
Keep minimal change. 2023-04-12 17:26:06 +02:00			`# triton branch`
			`make_quant_attn(model)`
Change warmup_autotune to a negative switch. 2023-04-13 14:59:49 +02:00			`if not shared.args.no_warmup_autotune:`
Keep minimal change. 2023-04-12 17:26:06 +02:00			`autotune_warmup(model)`
			`except ImportError: # not triton branch`
			`pass`

Generalized load_quantized 2023-03-28 19:38:55 +02:00			`model.seqlen = 2048`
			`print('Done.')`

			`return model`
Move LLaMA 4-bit into a separate file 2023-03-12 15:12:34 +01:00
Make the code more like PEP8 for readability (#862) 2023-04-07 05:15:45 +02:00
Add 4-bit LoRA support (#1200) 2023-04-17 04:26:52 +02:00			`# Used to locate the .pt/.safetensors quantized file`
			`def find_quantized_model_file(model_name):`
			`path_to_model = Path(f'{shared.args.model_dir}/{model_name}')`
			`pt_path = None`
			`priority_name_list = [`
			`Path(f'{shared.args.model_dir}/{model_name}{hyphen}{shared.args.wbits}bit{group}{ext}')`
			`for group in ([f'-{shared.args.groupsize}g', ''] if shared.args.groupsize > 0 else [''])`
			`for ext in ['.safetensors', '.pt']`
			`for hyphen in ['-', f'/{model_name}-', '/']`
			`]`
			`for path in priority_name_list:`
			`if path.exists():`
			`pt_path = path`
			`break`

			`# If the model hasn't been found with a well-behaved name, pick the last .pt`
			`# or the last .safetensors found in its folder as a last resort`
			`if not pt_path:`
			`found_pts = list(path_to_model.glob("*.pt"))`
			`found_safetensors = list(path_to_model.glob("*.safetensors"))`
			`pt_path = None`

			`if len(found_pts) > 0:`
			`if len(found_pts) > 1:`
			`print('Warning: more than one .pt model has been found. The last one will be selected. It could be wrong.')`
			`pt_path = found_pts[-1]`
			`elif len(found_safetensors) > 0:`
			`if len(found_pts) > 1:`
			`print('Warning: more than one .safetensors model has been found. The last one will be selected. It could be wrong.')`
			`pt_path = found_safetensors[-1]`

			`return pt_path`


			`# The function that loads the model in modules/models.py`
determine model type from model name 2023-03-13 20:11:32 +01:00			`def load_quantized(model_name):`
Add some comments, remove obsolete code 2023-04-13 16:17:32 +02:00
			`# Find the model type`
Add support for the latest GPTQ models with group-size (#530) Warning: old 4-bit weights will not work anymore! See here how to get up to date weights: https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model#step-2-get-the-pre-converted-weights 2023-03-26 05:11:33 +02:00			`if not shared.args.model_type:`
Adapt to the new model names 2023-03-30 02:47:36 +02:00			`name = model_name.lower()`
Detect "vicuna" as llama model type (#772) 2023-04-04 18:23:27 +02:00			`if any((k in name for k in ['llama', 'alpaca', 'vicuna'])):`
Add support for the latest GPTQ models with group-size (#530) Warning: old 4-bit weights will not work anymore! See here how to get up to date weights: https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model#step-2-get-the-pre-converted-weights 2023-03-26 05:11:33 +02:00			`model_type = 'llama'`
Adapt to the new model names 2023-03-30 02:47:36 +02:00			`elif any((k in name for k in ['opt-', 'galactica'])):`
Add support for the latest GPTQ models with group-size (#530) Warning: old 4-bit weights will not work anymore! See here how to get up to date weights: https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model#step-2-get-the-pre-converted-weights 2023-03-26 05:11:33 +02:00			`model_type = 'opt'`
Adapt to the new model names 2023-03-30 02:47:36 +02:00			`elif any((k in name for k in ['gpt-j', 'pygmalion-6b'])):`
Generalized load_quantized 2023-03-28 19:38:55 +02:00			`model_type = 'gptj'`
Add support for the latest GPTQ models with group-size (#530) Warning: old 4-bit weights will not work anymore! See here how to get up to date weights: https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model#step-2-get-the-pre-converted-weights 2023-03-26 05:11:33 +02:00			`else:`
			`print("Can't determine model type from model name. Please specify it manually using --model_type "`
determine model type from model name 2023-03-13 20:11:32 +01:00			`"argument")`
			`exit()`
			`else:`
Add support for the latest GPTQ models with group-size (#530) Warning: old 4-bit weights will not work anymore! See here how to get up to date weights: https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model#step-2-get-the-pre-converted-weights 2023-03-26 05:11:33 +02:00			`model_type = shared.args.model_type.lower()`
determine model type from model name 2023-03-13 20:11:32 +01:00
Add some comments, remove obsolete code 2023-04-13 16:17:32 +02:00			`# Select the appropriate load_quant function`
Minor rewrite 2023-04-05 06:21:40 +02:00			`if shared.args.pre_layer and model_type == 'llama':`
			`load_quant = llama_inference_offload.load_quant`
Generalized load_quantized 2023-03-28 19:38:55 +02:00			`elif model_type in ('llama', 'opt', 'gptj'):`
Minor rewrite 2023-04-05 06:21:40 +02:00			`if shared.args.pre_layer:`
			`print("Warning: ignoring --pre_layer because it only works for llama model type.")`
Generalized load_quantized 2023-03-28 19:38:55 +02:00			`load_quant = _load_quant`
Move LLaMA 4-bit into a separate file 2023-03-12 15:12:34 +01:00			`else:`
Generalized load_quantized 2023-03-28 19:38:55 +02:00			`print("Unknown pre-quantized model type specified. Only 'llama', 'opt' and 'gptj' are supported")`
refactor quant models loader and add support of OPT 2023-03-13 17:59:57 +01:00			`exit()`
Move LLaMA 4-bit into a separate file 2023-03-12 15:12:34 +01:00
Add 4-bit LoRA support (#1200) 2023-04-17 04:26:52 +02:00			`# Find the quantized model weights file (.pt/.safetensors)`
allow quantized model to be loaded from model dir (#760) 2023-04-05 04:19:38 +02:00			`path_to_model = Path(f'{shared.args.model_dir}/{model_name}')`
Add 4-bit LoRA support (#1200) 2023-04-17 04:26:52 +02:00			`pt_path = find_quantized_model_file(model_name)`
Move LLaMA 4-bit into a separate file 2023-03-12 15:12:34 +01:00			`if not pt_path:`
Add support for the latest GPTQ models with group-size (#530) Warning: old 4-bit weights will not work anymore! See here how to get up to date weights: https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model#step-2-get-the-pre-converted-weights 2023-03-26 05:11:33 +02:00			`print("Could not find the quantized model in .pt or .safetensors format, exiting...")`
Move LLaMA 4-bit into a separate file 2023-03-12 15:12:34 +01:00			`exit()`
More robust 4-bit model loading 2023-04-10 04:19:28 +02:00			`else:`
			`print(f"Found the following quantized model: {pt_path}")`
Move LLaMA 4-bit into a separate file 2023-03-12 15:12:34 +01:00
Update comments 2023-03-20 20:40:08 +01:00			`# qwopqwop200's offload`
Disable pre_layer when the model type is not llama 2023-04-05 06:19:26 +02:00			`if model_type == 'llama' and shared.args.pre_layer:`
Add support for the latest GPTQ models with group-size (#530) Warning: old 4-bit weights will not work anymore! See here how to get up to date weights: https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model#step-2-get-the-pre-converted-weights 2023-03-26 05:11:33 +02:00			`model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, shared.args.pre_layer)`
Add -gptq-preload for 4-bit offloading (#460) This works in a 4GB card now: ``` python server.py --model llama-7b-hf --gptq-bits 4 --gptq-pre-layer 20 ``` 2023-03-20 20:30:56 +01:00			`else:`
Disable kernel threshold for gpt-j 2023-03-28 21:45:38 +02:00			`threshold = False if model_type == 'gptj' else 128`
			`model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, kernel_switch_threshold=threshold)`
Move LLaMA 4-bit into a separate file 2023-03-12 15:12:34 +01:00
Update comments 2023-03-20 20:40:08 +01:00			`# accelerate offload (doesn't work properly)`
Better dispatch. 2023-04-12 19:48:17 +02:00			`if shared.args.gpu_memory or torch.cuda.device_count() > 1:`
			`if shared.args.gpu_memory:`
			`memory_map = list(map(lambda x: x.strip(), shared.args.gpu_memory))`
			`max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'`
			`max_memory = {}`
			`for i in range(len(memory_map)):`
			`max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i]`
			`max_memory['cpu'] = max_cpu_memory`
			`else:`
			`max_memory = accelerate.utils.get_balanced_memory(model)`
Move LLaMA 4-bit into a separate file 2023-03-12 15:12:34 +01:00
Add -gptq-preload for 4-bit offloading (#460) This works in a 4GB card now: ``` python server.py --model llama-7b-hf --gptq-bits 4 --gptq-pre-layer 20 ``` 2023-03-20 20:30:56 +01:00			`device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])`
Better dispatch. 2023-04-12 19:48:17 +02:00			`print("Using the following device map for the quantized model:", device_map)`
Add -gptq-preload for 4-bit offloading (#460) This works in a 4GB card now: ``` python server.py --model llama-7b-hf --gptq-bits 4 --gptq-pre-layer 20 ``` 2023-03-20 20:30:56 +01:00			`# https://huggingface.co/docs/accelerate/package_reference/big_modeling#accelerate.dispatch_model`
			`model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True)`
Update comments 2023-03-20 20:40:08 +01:00
			`# No offload`
Add -gptq-preload for 4-bit offloading (#460) This works in a 4GB card now: ``` python server.py --model llama-7b-hf --gptq-bits 4 --gptq-pre-layer 20 ``` 2023-03-20 20:30:56 +01:00			`elif not shared.args.cpu:`
			`model = model.to(torch.device('cuda:0'))`
Move LLaMA 4-bit into a separate file 2023-03-12 15:12:34 +01:00
			`return model`