diff --git a/README.md b/README.md
index 280d9867..deba55af 100644
--- a/README.md
+++ b/README.md
@@ -269,6 +269,7 @@ Optionally, you can use the following command-line flags:
 | `--xformers` | Use xformer's memory efficient attention. This should increase your tokens/s. |
 | `--sdp-attention` | Use torch 2.0's sdp attention. |
 | `--trust-remote-code` | Set trust_remote_code=True while loading a model. Necessary for ChatGLM and Falcon. |
+| `--use_fast` | Set use_fast=True while loading a tokenizer. |
 
 #### Accelerate 4-bit
 
diff --git a/modules/loaders.py b/modules/loaders.py
index 7d1b2d96..3104ca56 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -20,6 +20,7 @@ loaders_and_params = OrderedDict({
         'quant_type',
         'compute_dtype',
         'trust_remote_code',
+        'use_fast',
         'alpha_value',
         'rope_freq_base',
         'compress_pos_emb',
@@ -33,6 +34,7 @@ loaders_and_params = OrderedDict({
         'rope_freq_base',
         'compress_pos_emb',
         'cfg_cache',
+        'use_fast',
         'exllama_HF_info',
     ],
     'ExLlamav2_HF': [
@@ -41,6 +43,7 @@ loaders_and_params = OrderedDict({
         'cfg_cache',
         'alpha_value',
         'compress_pos_emb',
+        'use_fast',
     ],
     'ExLlama': [
         'gpu_split',
@@ -71,6 +74,7 @@ loaders_and_params = OrderedDict({
         'disk',
         'auto_devices',
         'trust_remote_code',
+        'use_fast',
         'autogptq_info',
     ],
     'GPTQ-for-LLaMa': [
@@ -78,6 +82,7 @@ loaders_and_params = OrderedDict({
         'groupsize',
         'model_type',
         'pre_layer',
+        'use_fast',
         'gptq_for_llama_info',
     ],
     'llama.cpp': [
@@ -111,6 +116,7 @@ loaders_and_params = OrderedDict({
         'compress_pos_emb',
         'cpu',
         'cfg_cache',
+        'use_fast',
         'llamacpp_HF_info',
     ],
     'ctransformers': [
diff --git a/modules/models.py b/modules/models.py
index c0d867b7..06dfe994 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -99,18 +99,14 @@ def load_tokenizer(model_name, model):
     if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
         tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
     elif path_to_model.exists():
-        try:
-            tokenizer = AutoTokenizer.from_pretrained(
-                path_to_model,
-                trust_remote_code=shared.args.trust_remote_code,
-                use_fast=False
-            )
-        except ValueError:
-            tokenizer = AutoTokenizer.from_pretrained(
-                path_to_model,
-                trust_remote_code=shared.args.trust_remote_code,
-                use_fast=True
-            )
+        if shared.args.use_fast:
+            logger.info('Loading the tokenizer with use_fast=True.')
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            path_to_model,
+            trust_remote_code=shared.args.trust_remote_code,
+            use_fast=shared.args.use_fast
+        )
 
     return tokenizer
 
@@ -249,10 +245,13 @@
         logger.error("Could not load the model because a tokenizer in transformers format was not found. Please download oobabooga/llama-tokenizer.")
         return None, None
 
+    if shared.args.use_fast:
+        logger.info('Loading the tokenizer with use_fast=True.')
+
     tokenizer = AutoTokenizer.from_pretrained(
         path,
         trust_remote_code=shared.args.trust_remote_code,
-        use_fast=False
+        use_fast=shared.args.use_fast
     )
 
     model = LlamacppHF.from_pretrained(model_name)
diff --git a/modules/shared.py b/modules/shared.py
index e534af20..387a1f6b 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -105,6 +105,7 @@ parser.add_argument('--no-cache', action='store_true', help='Set use_cache to Fa
 parser.add_argument('--xformers', action='store_true', help="Use xformer's memory efficient attention. This should increase your tokens/s.")
 parser.add_argument('--sdp-attention', action='store_true', help="Use torch 2.0's sdp attention.")
 parser.add_argument('--trust-remote-code', action='store_true', help="Set trust_remote_code=True while loading a model. Necessary for ChatGLM and Falcon.")
+parser.add_argument('--use_fast', action='store_true', help="Set use_fast=True while loading a tokenizer.")
 
 # Accelerate 4-bit
 parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).')
diff --git a/modules/ui.py b/modules/ui.py
index afb8a1ef..823d5d0f 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -52,6 +52,7 @@ def list_model_elements():
         'bf16',
         'load_in_8bit',
         'trust_remote_code',
+        'use_fast',
         'load_in_4bit',
         'compute_dtype',
         'quant_type',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index b57e11f4..4be48d8b 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -115,6 +115,7 @@ def create_ui():
                     shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
                     shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
                     shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.')
+                    shared.gradio['use_fast'] = gr.Checkbox(label="use_fast", value=shared.args.use_fast, info='Set use_fast=True while loading the tokenizer. May trigger a conversion that takes several minutes.')
                     shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.')
                     shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
                     shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).')
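
For reference, the snippet below is a minimal sketch (not part of the patch) of what the new flag controls downstream. The model directory "models/my-model" is a placeholder, and the local use_fast variable stands in for shared.args.use_fast, which argparse now sets from --use_fast.

from transformers import AutoTokenizer

# use_fast stands in for shared.args.use_fast; the flag defaults to off, and the
# patch drops the old ValueError fallback that retried loading with use_fast=True.
use_fast = False

tokenizer = AutoTokenizer.from_pretrained(
    "models/my-model",           # placeholder model directory
    trust_remote_code=False,
    use_fast=use_fast,           # True selects the Rust-backed "fast" tokenizer when one is available
)
print(type(tokenizer).__name__)  # e.g. LlamaTokenizer (slow) vs. LlamaTokenizerFast

From the command line this corresponds to something like "python server.py --use_fast", or ticking the new use_fast checkbox in the model menu.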