From 8b66d83aa92fdf33303994a4b72781aac3ddcc21 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 16 Nov 2023 19:45:05 -0800 Subject: [PATCH] Set use_fast=True by default, create --no_use_fast flag This increases tokens/second for HF loaders. --- docs/04 - Model Tab.md | 2 +- modules/loaders.py | 14 +++++++------- modules/models.py | 12 ++++++------ modules/shared.py | 5 +++-- modules/ui.py | 2 +- modules/ui_model_menu.py | 4 ++-- 6 files changed, 20 insertions(+), 19 deletions(-) diff --git a/docs/04 - Model Tab.md b/docs/04 - Model Tab.md index 2587eedc..fcc10ef0 100644 --- a/docs/04 - Model Tab.md +++ b/docs/04 - Model Tab.md @@ -28,7 +28,7 @@ Options: * **disk**: Enable disk offloading for layers that don't fit into the GPU and CPU combined. * **load-in-4bit**: Load the model in 4-bit precision using bitsandbytes. * **trust-remote-code**: Some models use custom Python code to load the model or the tokenizer. For such models, this option needs to be set. It doesn't download any remote content: all it does is execute the .py files that get downloaded with the model. Those files can potentially include malicious code; I have never seen it happen, but it is in principle possible. -* **use_fast**: Use the "fast" version of the tokenizer. Especially useful for Llama models, which originally had a "slow" tokenizer that received an update. If your local files are in the old "slow" format, checking this option may trigger a conversion that takes several minutes. The fast tokenizer is mostly useful if you are generating 50+ tokens/second using ExLlama_HF or if you are tokenizing a huge dataset for training. +* **no_use_fast**: Do not use the "fast" version of the tokenizer. Can usually be ignored; only check this if you can't load the tokenizer for your model otherwise. * **use_flash_attention_2**: Set use_flash_attention_2=True while loading the model. Possibly useful for training. * **disable_exllama**: Only applies when you are loading a GPTQ model through the transformers loader. It needs to be checked if you intend to train LoRAs with the model. diff --git a/modules/loaders.py b/modules/loaders.py index 607a63d3..2f1648c7 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -19,7 +19,7 @@ loaders_and_params = OrderedDict({ 'quant_type', 'compute_dtype', 'trust_remote_code', - 'use_fast', + 'no_use_fast', 'use_flash_attention_2', 'alpha_value', 'rope_freq_base', @@ -34,7 +34,7 @@ loaders_and_params = OrderedDict({ 'rope_freq_base', 'compress_pos_emb', 'cfg_cache', - 'use_fast', + 'no_use_fast', 'exllama_HF_info', ], 'ExLlamav2_HF': [ @@ -45,7 +45,7 @@ loaders_and_params = OrderedDict({ 'cache_8bit', 'alpha_value', 'compress_pos_emb', - 'use_fast', + 'no_use_fast', ], 'ExLlama': [ 'gpu_split', @@ -78,7 +78,7 @@ loaders_and_params = OrderedDict({ 'disk', 'auto_devices', 'trust_remote_code', - 'use_fast', + 'no_use_fast', 'autogptq_info', ], 'GPTQ-for-LLaMa': [ @@ -86,7 +86,7 @@ loaders_and_params = OrderedDict({ 'groupsize', 'model_type', 'pre_layer', - 'use_fast', + 'no_use_fast', 'gptq_for_llama_info', ], 'llama.cpp': [ @@ -119,7 +119,7 @@ loaders_and_params = OrderedDict({ 'compress_pos_emb', 'numa', 'cfg_cache', - 'use_fast', + 'no_use_fast', 'logits_all', 'llamacpp_HF_info', ], @@ -139,7 +139,7 @@ loaders_and_params = OrderedDict({ 'max_seq_len', 'no_inject_fused_attention', 'trust_remote_code', - 'use_fast', + 'no_use_fast', ] }) diff --git a/modules/models.py b/modules/models.py index e4c3ddaa..19c0d903 100644 --- a/modules/models.py +++ b/modules/models.py @@ -114,13 +114,13 @@ def load_tokenizer(model_name, model): if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists(): tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/")) elif path_to_model.exists(): - if shared.args.use_fast: - logger.info('Loading the tokenizer with use_fast=True.') + if shared.args.no_use_fast: + logger.info('Loading the tokenizer with use_fast=False.') tokenizer = AutoTokenizer.from_pretrained( path_to_model, trust_remote_code=shared.args.trust_remote_code, - use_fast=shared.args.use_fast + use_fast=not shared.args.no_use_fast ) return tokenizer @@ -262,13 +262,13 @@ def llamacpp_HF_loader(model_name): logger.error("Could not load the model because a tokenizer in transformers format was not found. Please download oobabooga/llama-tokenizer.") return None, None - if shared.args.use_fast: - logger.info('Loading the tokenizer with use_fast=True.') + if shared.args.no_use_fast: + logger.info('Loading the tokenizer with use_fast=False.') tokenizer = AutoTokenizer.from_pretrained( path, trust_remote_code=shared.args.trust_remote_code, - use_fast=shared.args.use_fast + use_fast=not shared.args.no_use_fast ) model = LlamacppHF.from_pretrained(model_name) diff --git a/modules/shared.py b/modules/shared.py index b4750b26..09cf006a 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -93,7 +93,7 @@ parser.add_argument('--xformers', action='store_true', help='Use xformer\'s memo parser.add_argument('--sdp-attention', action='store_true', help='Use PyTorch 2.0\'s SDP attention. Same as above.') parser.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.') parser.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.') -parser.add_argument('--use_fast', action='store_true', help='Set use_fast=True while loading the tokenizer.') +parser.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Set this if you have any problems related to use_fast.') parser.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.') # Accelerate 4-bit @@ -182,6 +182,7 @@ parser.add_argument('--mul_mat_q', action='store_true', help='DEPRECATED') parser.add_argument('--api-blocking-port', type=int, default=5000, help='DEPRECATED') parser.add_argument('--api-streaming-port', type=int, default=5005, help='DEPRECATED') parser.add_argument('--llama_cpp_seed', type=int, default=0, help='DEPRECATED') +parser.add_argument('--use_fast', action='store_true', help='DEPRECATED') args = parser.parse_args() args_defaults = parser.parse_args([]) @@ -192,7 +193,7 @@ for arg in sys.argv[1:]: provided_arguments.append(arg) # Deprecation warnings -for k in ['chat', 'notebook', 'no_stream', 'mul_mat_q']: +for k in ['notebook', 'chat', 'no_stream', 'mul_mat_q', 'use_fast']: if getattr(args, k): logger.warning(f'The --{k} flag has been deprecated and will be removed soon. Please remove that flag.') diff --git a/modules/ui.py b/modules/ui.py index 5984a588..383bc66f 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -52,7 +52,7 @@ def list_model_elements(): 'bf16', 'load_in_8bit', 'trust_remote_code', - 'use_fast', + 'no_use_fast', 'use_flash_attention_2', 'load_in_4bit', 'compute_dtype', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 3e4c6f8e..12edeed9 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -109,7 +109,6 @@ def create_ui(): shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.') - shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.') shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') @@ -122,12 +121,13 @@ def create_ui(): shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant) shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17') shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code) - shared.gradio['use_fast'] = gr.Checkbox(label="use_fast", value=shared.args.use_fast, info='Set use_fast=True while loading the tokenizer. May trigger a conversion that takes several minutes.') + shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.') shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.') shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.') shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.') shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.') + shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).') shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/wiki/04-%E2%80%90-Model-Tab#exllama_hf).') shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.')