mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2025-01-26 12:22:08 +01:00
Add HQQ quant loader (#4888)
--------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com>
This commit is contained in:
parent
64a57d9dc2
commit
674be9a09a
@ -305,6 +305,12 @@ List of command-line flags
|
||||
|-------------|-------------|
|
||||
| `--model_type MODEL_TYPE` | Model type of pre-quantized model. Currently gpt2, gptj, gptneox, falcon, llama, mpt, starcoder (gptbigcode), dollyv2, and replit are supported. |
|
||||
|
||||
#### HQQ
|
||||
|
||||
| Flag | Description |
|
||||
|-------------|-------------|
|
||||
| `--hqq-backend` | Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. |
|
||||
|
||||
#### DeepSpeed
|
||||
|
||||
| Flag | Description |
|
||||
|
@ -155,6 +155,11 @@ loaders_and_params = OrderedDict({
|
||||
'trust_remote_code',
|
||||
'no_use_fast',
|
||||
'no_flash_attn',
|
||||
],
|
||||
'HQQ': [
|
||||
'hqq_backend',
|
||||
'trust_remote_code',
|
||||
'no_use_fast',
|
||||
]
|
||||
})
|
||||
|
||||
@ -503,6 +508,43 @@ loaders_samplers = {
|
||||
'skip_special_tokens',
|
||||
'auto_max_new_tokens',
|
||||
},
|
||||
'HQQ': {
|
||||
'temperature',
|
||||
'temperature_last',
|
||||
'top_p',
|
||||
'min_p',
|
||||
'top_k',
|
||||
'typical_p',
|
||||
'epsilon_cutoff',
|
||||
'eta_cutoff',
|
||||
'tfs',
|
||||
'top_a',
|
||||
'repetition_penalty',
|
||||
'presence_penalty',
|
||||
'frequency_penalty',
|
||||
'repetition_penalty_range',
|
||||
'encoder_repetition_penalty',
|
||||
'no_repeat_ngram_size',
|
||||
'min_length',
|
||||
'seed',
|
||||
'do_sample',
|
||||
'penalty_alpha',
|
||||
'num_beams',
|
||||
'length_penalty',
|
||||
'early_stopping',
|
||||
'mirostat_mode',
|
||||
'mirostat_tau',
|
||||
'mirostat_eta',
|
||||
'grammar_file_row',
|
||||
'grammar_string',
|
||||
'guidance_scale',
|
||||
'negative_prompt',
|
||||
'ban_eos_token',
|
||||
'custom_token_bans',
|
||||
'add_bos_token',
|
||||
'skip_special_tokens',
|
||||
'auto_max_new_tokens',
|
||||
},
|
||||
}
|
||||
|
||||
loaders_model_types = {
|
||||
|
@ -73,6 +73,7 @@ def load_model(model_name, loader=None):
|
||||
'ctransformers': ctransformers_loader,
|
||||
'AutoAWQ': AutoAWQ_loader,
|
||||
'QuIP#': QuipSharp_loader,
|
||||
'HQQ': HQQ_loader,
|
||||
}
|
||||
|
||||
metadata = get_model_metadata(model_name)
|
||||
@ -411,6 +412,18 @@ def ExLlamav2_HF_loader(model_name):
|
||||
return Exllamav2HF.from_pretrained(model_name)
|
||||
|
||||
|
||||
def HQQ_loader(model_name):
|
||||
from hqq.engine.hf import HQQModelForCausalLM
|
||||
from hqq.core.quantize import HQQLinear, HQQBackend
|
||||
|
||||
logger.info(f"Loading HQQ model with backend: {shared.args.hqq_backend}")
|
||||
|
||||
model_dir = Path(f'{shared.args.model_dir}/{model_name}')
|
||||
model = HQQModelForCausalLM.from_quantized(str(model_dir))
|
||||
HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend))
|
||||
return model
|
||||
|
||||
|
||||
def RWKV_loader(model_name):
|
||||
'''
|
||||
This loader is not currently maintained as RWKV can now be loaded
|
||||
|
@ -163,6 +163,8 @@ def infer_loader(model_name, model_settings):
|
||||
loader = 'RWKV'
|
||||
elif re.match(r'.*exl2', model_name.lower()):
|
||||
loader = 'ExLlamav2_HF'
|
||||
elif re.match(r'.*-hqq', model_name.lower()):
|
||||
return 'HQQ'
|
||||
else:
|
||||
loader = 'Transformers'
|
||||
|
||||
|
@ -144,6 +144,9 @@ parser.add_argument('--pre_layer', type=int, nargs='+', help='The number of laye
|
||||
parser.add_argument('--checkpoint', type=str, help='The path to the quantized checkpoint file. If not specified, it will be automatically detected.')
|
||||
parser.add_argument('--monkey-patch', action='store_true', help='Apply the monkey patch for using LoRAs with quantized models.')
|
||||
|
||||
# HQQ
|
||||
parser.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
|
||||
|
||||
# DeepSpeed
|
||||
parser.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
|
||||
parser.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.')
|
||||
@ -246,6 +249,8 @@ def fix_loader_name(name):
|
||||
return 'AutoAWQ'
|
||||
elif name in ['quip#', 'quip-sharp', 'quipsharp', 'quip_sharp']:
|
||||
return 'QuIP#'
|
||||
elif name in ['hqq']:
|
||||
return 'HQQ'
|
||||
|
||||
|
||||
def add_extension(name, last=False):
|
||||
|
@ -91,6 +91,7 @@ def list_model_elements():
|
||||
'rope_freq_base',
|
||||
'numa',
|
||||
'logits_all',
|
||||
'hqq_backend',
|
||||
]
|
||||
if is_torch_xpu_available():
|
||||
for i in range(torch.xpu.device_count()):
|
||||
|
@ -84,6 +84,7 @@ def create_ui():
|
||||
shared.gradio['transformers_info'] = gr.Markdown('load-in-4bit params:')
|
||||
shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype)
|
||||
shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type)
|
||||
shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
|
||||
|
||||
shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=128, value=shared.args.n_gpu_layers)
|
||||
shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=shared.settings['truncation_length_max'], step=256, label="n_ctx", value=shared.args.n_ctx, info='Context length. Try lowering this if you run out of memory while loading the model.')
|
||||
|
@ -4,6 +4,7 @@ datasets
|
||||
einops
|
||||
exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
|
||||
gradio==3.50.*
|
||||
hqq==0.1.1
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
optimum==1.15.*
|
||||
|
@ -4,6 +4,7 @@ datasets
|
||||
einops
|
||||
exllamav2==0.0.11
|
||||
gradio==3.50.*
|
||||
hqq==0.1.1
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
optimum==1.15.*
|
||||
|
@ -4,6 +4,7 @@ datasets
|
||||
einops
|
||||
exllamav2==0.0.11
|
||||
gradio==3.50.*
|
||||
hqq==0.1.1
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
optimum==1.15.*
|
||||
|
@ -4,6 +4,7 @@ datasets
|
||||
einops
|
||||
exllamav2==0.0.11
|
||||
gradio==3.50.*
|
||||
hqq==0.1.1
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
optimum==1.15.*
|
||||
|
@ -4,6 +4,7 @@ datasets
|
||||
einops
|
||||
exllamav2==0.0.11
|
||||
gradio==3.50.*
|
||||
hqq==0.1.1
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
optimum==1.15.*
|
||||
|
@ -4,6 +4,7 @@ datasets
|
||||
einops
|
||||
exllamav2==0.0.11
|
||||
gradio==3.50.*
|
||||
hqq==0.1.1
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
optimum==1.15.*
|
||||
|
@ -4,6 +4,7 @@ datasets
|
||||
einops
|
||||
exllamav2==0.0.11
|
||||
gradio==3.50.*
|
||||
hqq==0.1.1
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
optimum==1.15.*
|
||||
|
@ -4,6 +4,7 @@ datasets
|
||||
einops
|
||||
exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
|
||||
gradio==3.50.*
|
||||
hqq==0.1.1
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
optimum==1.15.*
|
||||
|
@ -4,6 +4,7 @@ datasets
|
||||
einops
|
||||
exllamav2==0.0.11
|
||||
gradio==3.50.*
|
||||
hqq==0.1.1
|
||||
markdown
|
||||
numpy==1.24.*
|
||||
optimum==1.15.*
|
||||
|
Loading…
Reference in New Issue
Block a user