From d0d221df49e930303703f748260a76fa37586ce3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Sep 2023 12:19:43 -0700
Subject: [PATCH] Add --use_fast option (closes #3741)

---
 README.md                | 1 +
 modules/loaders.py       | 6 ++++++
 modules/models.py        | 25 ++++++++++++-------------
 modules/shared.py        | 1 +
 modules/ui.py            | 1 +
 modules/ui_model_menu.py | 1 +
 6 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 280d9867..deba55af 100644
--- a/README.md
+++ b/README.md
@@ -269,6 +269,7 @@ Optionally, you can use the following command-line flags:
 | `--xformers` | Use xformer's memory efficient attention. This should increase your tokens/s. |
 | `--sdp-attention` | Use torch 2.0's sdp attention. |
 | `--trust-remote-code` | Set trust_remote_code=True while loading a model. Necessary for ChatGLM and Falcon. |
+| `--use_fast` | Set use_fast=True while loading a tokenizer. |
 
 #### Accelerate 4-bit
 
diff --git a/modules/loaders.py b/modules/loaders.py
index 7d1b2d96..3104ca56 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -20,6 +20,7 @@ loaders_and_params = OrderedDict({
         'quant_type',
         'compute_dtype',
         'trust_remote_code',
+        'use_fast',
         'alpha_value',
         'rope_freq_base',
         'compress_pos_emb',
@@ -33,6 +34,7 @@ loaders_and_params = OrderedDict({
         'rope_freq_base',
         'compress_pos_emb',
         'cfg_cache',
+        'use_fast',
         'exllama_HF_info',
     ],
     'ExLlamav2_HF': [
@@ -41,6 +43,7 @@ loaders_and_params = OrderedDict({
         'cfg_cache',
         'alpha_value',
         'compress_pos_emb',
+        'use_fast',
     ],
     'ExLlama': [
         'gpu_split',
@@ -71,6 +74,7 @@ loaders_and_params = OrderedDict({
         'disk',
         'auto_devices',
         'trust_remote_code',
+        'use_fast',
         'autogptq_info',
     ],
     'GPTQ-for-LLaMa': [
@@ -78,6 +82,7 @@ loaders_and_params = OrderedDict({
         'groupsize',
         'model_type',
         'pre_layer',
+        'use_fast',
         'gptq_for_llama_info',
     ],
     'llama.cpp': [
@@ -111,6 +116,7 @@ loaders_and_params = OrderedDict({
         'compress_pos_emb',
         'cpu',
         'cfg_cache',
+        'use_fast',
         'llamacpp_HF_info',
     ],
     'ctransformers': [
diff --git a/modules/models.py b/modules/models.py
index c0d867b7..06dfe994 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -99,18 +99,14 @@ def load_tokenizer(model_name, model):
     if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
         tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
     elif path_to_model.exists():
-        try:
-            tokenizer = AutoTokenizer.from_pretrained(
-                path_to_model,
-                trust_remote_code=shared.args.trust_remote_code,
-                use_fast=False
-            )
-        except ValueError:
-            tokenizer = AutoTokenizer.from_pretrained(
-                path_to_model,
-                trust_remote_code=shared.args.trust_remote_code,
-                use_fast=True
-            )
+        if shared.args.use_fast:
+            logger.info('Loading the tokenizer with use_fast=True.')
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            path_to_model,
+            trust_remote_code=shared.args.trust_remote_code,
+            use_fast=shared.args.use_fast
+        )
 
     return tokenizer
 
@@ -249,10 +245,13 @@ def llamacpp_HF_loader(model_name):
         logger.error("Could not load the model because a tokenizer in transformers format was not found. Please download oobabooga/llama-tokenizer.")
         return None, None
 
+    if shared.args.use_fast:
+        logger.info('Loading the tokenizer with use_fast=True.')
+
     tokenizer = AutoTokenizer.from_pretrained(
         path,
         trust_remote_code=shared.args.trust_remote_code,
-        use_fast=False
+        use_fast=shared.args.use_fast
     )
 
     model = LlamacppHF.from_pretrained(model_name)
diff --git a/modules/shared.py b/modules/shared.py
index e534af20..387a1f6b 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -105,6 +105,7 @@ parser.add_argument('--no-cache', action='store_true', help='Set use_cache to Fa
 parser.add_argument('--xformers', action='store_true', help="Use xformer's memory efficient attention. This should increase your tokens/s.")
 parser.add_argument('--sdp-attention', action='store_true', help="Use torch 2.0's sdp attention.")
 parser.add_argument('--trust-remote-code', action='store_true', help="Set trust_remote_code=True while loading a model. Necessary for ChatGLM and Falcon.")
+parser.add_argument('--use_fast', action='store_true', help="Set use_fast=True while loading a tokenizer.")
 
 # Accelerate 4-bit
 parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).')
diff --git a/modules/ui.py b/modules/ui.py
index afb8a1ef..823d5d0f 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -52,6 +52,7 @@ def list_model_elements():
         'bf16',
         'load_in_8bit',
         'trust_remote_code',
+        'use_fast',
         'load_in_4bit',
         'compute_dtype',
         'quant_type',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index b57e11f4..4be48d8b 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -115,6 +115,7 @@ def create_ui():
                     shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
                     shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
                     shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.')
+                    shared.gradio['use_fast'] = gr.Checkbox(label="use_fast", value=shared.args.use_fast, info='Set use_fast=True while loading the tokenizer. May trigger a conversion that takes several minutes.')
                     shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.')
                     shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
                     shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).')
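
Note (not part of the patch itself): the new `--use_fast` flag is simply forwarded to the `use_fast` argument of `transformers.AutoTokenizer.from_pretrained`. A minimal sketch of what that toggle does, assuming a placeholder local model folder named `models/my-model`:

```python
# Illustrative sketch only; mirrors the patched load_tokenizer() logic.
from transformers import AutoTokenizer

use_fast = True  # equivalent to launching the webui with --use_fast (the patch defaults to False)

tokenizer = AutoTokenizer.from_pretrained(
    "models/my-model",        # placeholder path to a local model folder
    trust_remote_code=False,  # stands in for shared.args.trust_remote_code
    use_fast=use_fast,        # True loads the Rust-based "fast" tokenizer when one is available
)

# Fast tokenizers report is_fast == True; loading one from a slow/SentencePiece
# checkpoint may trigger a one-time conversion, which is why the UI warns that
# it can take several minutes.
print(tokenizer.is_fast)
```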