Mirror of https://github.com/oobabooga/text-generation-webui.git
Synced 2024-11-26 01:30:20 +01:00
Add q-cache 6 and 8 support for Exllamav2
parent 608545d282
commit 3d17c80954
@@ -7,6 +7,8 @@ from exllamav2 import (
     ExLlamaV2Cache,
     ExLlamaV2Cache_8bit,
     ExLlamaV2Cache_Q4,
+    ExLlamaV2Cache_Q6,
+    ExLlamaV2Cache_Q8,
     ExLlamaV2Config,
     ExLlamaV2Tokenizer
 )
@@ -63,8 +65,12 @@ class Exllamav2Model:
 
         if shared.args.cache_8bit:
             cache = ExLlamaV2Cache_8bit(model, lazy=shared.args.autosplit)
-        elif shared.args.cache_4bit:
+        elif shared.args.cache_q4:
             cache = ExLlamaV2Cache_Q4(model, lazy=shared.args.autosplit)
+        elif shared.args.cache_q6:
+            cache = ExLlamaV2Cache_Q6(model, lazy=shared.args.autosplit)
+        elif shared.args.cache_q8:
+            cache = ExLlamaV2Cache_Q8(model, lazy=shared.args.autosplit)
         else:
             cache = ExLlamaV2Cache(model, lazy=shared.args.autosplit)
 
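The new branches keep the existing priority order: cache_8bit is checked first, then the Q4, Q6, and Q8 quantized caches, with the unquantized cache as the fallback. As a reading aid only, here is a minimal sketch of the same selection written as a table lookup; the helper name pick_cache is illustrative and not part of the commit, and it assumes the shared.args flags and exllamav2 cache classes shown in the hunks above.

from exllamav2 import (
    ExLlamaV2Cache,
    ExLlamaV2Cache_8bit,
    ExLlamaV2Cache_Q4,
    ExLlamaV2Cache_Q6,
    ExLlamaV2Cache_Q8,
)


def pick_cache(model, args):
    # Illustrative only: same priority order as the if/elif chain in the diff.
    flag_to_cls = [
        ('cache_8bit', ExLlamaV2Cache_8bit),
        ('cache_q4', ExLlamaV2Cache_Q4),
        ('cache_q6', ExLlamaV2Cache_Q6),
        ('cache_q8', ExLlamaV2Cache_Q8),
    ]
    for flag, cls in flag_to_cls:
        if getattr(args, flag, False):
            return cls(model, lazy=args.autosplit)
    return ExLlamaV2Cache(model, lazy=args.autosplit)

Keeping the lookup order identical to the elif chain matters, since only the first enabled flag takes effect.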
@@ -9,6 +9,8 @@ from exllamav2 import (
     ExLlamaV2Cache,
     ExLlamaV2Cache_8bit,
     ExLlamaV2Cache_Q4,
+    ExLlamaV2Cache_Q6,
+    ExLlamaV2Cache_Q8,
     ExLlamaV2Config
 )
 from torch.nn import CrossEntropyLoss
@@ -51,8 +53,12 @@ class Exllamav2HF(PreTrainedModel):
 
         if shared.args.cache_8bit:
             self.ex_cache = ExLlamaV2Cache_8bit(self.ex_model, lazy=shared.args.autosplit)
-        elif shared.args.cache_4bit:
+        elif shared.args.cache_q4:
             self.ex_cache = ExLlamaV2Cache_Q4(self.ex_model, lazy=shared.args.autosplit)
+        elif shared.args.cache_q6:
+            self.ex_cache = ExLlamaV2Cache_Q6(self.ex_model, lazy=shared.args.autosplit)
+        elif shared.args.cache_q8:
+            self.ex_cache = ExLlamaV2Cache_Q8(self.ex_model, lazy=shared.args.autosplit)
         else:
             self.ex_cache = ExLlamaV2Cache(self.ex_model, lazy=shared.args.autosplit)
 
@@ -63,8 +69,12 @@ class Exllamav2HF(PreTrainedModel):
         if shared.args.cfg_cache:
             if shared.args.cache_8bit:
                 self.ex_cache_negative = ExLlamaV2Cache_8bit(self.ex_model)
-            elif shared.args.cache_4bit:
+            elif shared.args.cache_q4:
                 self.ex_cache_negative = ExLlamaV2Cache_Q4(self.ex_model)
+            elif shared.args.cache_q6:
+                self.ex_cache_negative = ExLlamaV2Cache_Q6(self.ex_model)
+            elif shared.args.cache_q8:
+                self.ex_cache_negative = ExLlamaV2Cache_Q8(self.ex_model)
             else:
                 self.ex_cache_negative = ExLlamaV2Cache(self.ex_model)
 
@@ -88,7 +88,9 @@ loaders_and_params = OrderedDict({
         'no_sdpa',
         'num_experts_per_token',
         'cache_8bit',
-        'cache_4bit',
+        'cache_q4',
+        'cache_q6',
+        'cache_q8',
         'autosplit',
         'alpha_value',
         'compress_pos_emb',
@@ -103,7 +105,9 @@ loaders_and_params = OrderedDict({
         'no_sdpa',
         'num_experts_per_token',
         'cache_8bit',
-        'cache_4bit',
+        'cache_q4',
+        'cache_q6',
+        'cache_q8',
         'autosplit',
         'alpha_value',
         'compress_pos_emb',
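These two lists appear to be the per-loader option registries that control which settings the UI exposes, so the cache_q4, cache_q6, and cache_q8 keys are added to both ExLlamaV2 entries. A heavily abbreviated sketch of the registry's shape after this change; the loader key names and the trimmed lists are assumptions made for illustration, not the project's actual contents.

from collections import OrderedDict

# Abbreviated illustration only: the real lists hold many more options, and the
# loader key names ('ExLlamav2_HF', 'ExLlamav2') are assumed for this sketch.
loaders_and_params = OrderedDict({
    'ExLlamav2_HF': ['cache_8bit', 'cache_q4', 'cache_q6', 'cache_q8', 'autosplit'],
    'ExLlamav2': ['cache_8bit', 'cache_q4', 'cache_q6', 'cache_q8', 'autosplit'],
})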
@@ -142,8 +142,11 @@ group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Creat
 group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
 group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.')
 group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.')
-group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.')
-group.add_argument('--cache_4bit', action='store_true', help='Use Q4 cache to save VRAM.')
+group.add_argument('--cache_4bit', action='store_true', help='Use 4-bit cache to save VRAM (llama.cpp).')
+group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit (FP8) cache to save VRAM.')
+group.add_argument('--cache_q4', action='store_true', help='Use Q4 cache to save VRAM.')
+group.add_argument('--cache_q6', action='store_true', help='Use Q6 cache to save VRAM.')
+group.add_argument('--cache_q8', action='store_true', help='Use Q8 cache to save VRAM.')
 group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
 
 # AutoGPTQ
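All of the new options are plain store_true flags. A standalone sketch of how they parse, kept separate from the project's real argument setup; the parser and group construction here are illustrative:

import argparse

# Standalone illustration of the flags added above; not the project's shared.py.
parser = argparse.ArgumentParser()
group = parser.add_argument_group('ExLlamaV2 cache')
group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit (FP8) cache to save VRAM.')
group.add_argument('--cache_q4', action='store_true', help='Use Q4 cache to save VRAM.')
group.add_argument('--cache_q6', action='store_true', help='Use Q6 cache to save VRAM.')
group.add_argument('--cache_q8', action='store_true', help='Use Q8 cache to save VRAM.')

# Example: a command line that opts into the Q6 cache.
args = parser.parse_args(['--cache_q6'])
assert args.cache_q6 and not args.cache_q8

Because the loader code checks these flags in a fixed if/elif order, enabling more than one cache flag does not raise an error; the highest-priority one simply wins.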
@@ -89,6 +89,9 @@ def list_model_elements():
         'num_experts_per_token',
         'cache_8bit',
         'cache_4bit',
+        'cache_q4',
+        'cache_q6',
+        'cache_q8',
         'autosplit',
         'threads',
         'threads_batch',
@@ -118,8 +118,11 @@ def create_ui():
             shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
             shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
             shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This may increase performance on newer cards.')
-            shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
-            shared.gradio['cache_4bit'] = gr.Checkbox(label="cache_4bit", value=shared.args.cache_4bit, info='Use Q4 cache to save VRAM.')
+            shared.gradio['cache_4bit'] = gr.Checkbox(label="cache_4bit", value=shared.args.cache_4bit, info='Use 4-bit (FP4) cache to save VRAM.')
+            shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit (FP8) cache to save VRAM.')
+            shared.gradio['cache_q4'] = gr.Checkbox(label="cache_q4", value=shared.args.cache_q4, info='Use Q4 cache to save VRAM.')
+            shared.gradio['cache_q6'] = gr.Checkbox(label="cache_q6", value=shared.args.cache_q6, info='Use Q6 cache to save VRAM.')
+            shared.gradio['cache_q8'] = gr.Checkbox(label="cache_q8", value=shared.args.cache_q8, info='Use Q8 cache to save VRAM.')
             shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
             shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size, precision=0, info='StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn\'t share a prefix with the old prompt.')
             shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')