llamacpp_HF: do not use oobabooga/llama-tokenizer (#5499)

2024-11-22 08:07:56 +01:00 · 2024-02-14 00:28:51 -03:00 · 2024-02-14 00:28:51 -03:00 · d47182d9d1
commit d47182d9d1
parent 3a9ce3cfa6
2 changed files with 8 additions and 17 deletions
--- a/modules/models.py
+++ b/modules/models.py
@@ -254,26 +254,17 @@ def llamacpp_loader(model_name):
 def llamacpp_HF_loader(model_name):
    from modules.llamacpp_hf import LlamacppHF

-    for fname in [model_name, "oobabooga_llama-tokenizer", "llama-tokenizer"]:
-        path = Path(f'{shared.args.model_dir}/{fname}')
-        if all((path / file).exists() for file in ['tokenizer_config.json', 'special_tokens_map.json', 'tokenizer.model']):
+    path = Path(f'{shared.args.model_dir}/{model_name}')
+
+    # Check if a HF tokenizer is available for the model
+    if all((path / file).exists() for file in ['tokenizer.model', 'tokenizer_config.json']):
        logger.info(f'Using tokenizer from: \"{path}\"')
-            break
    else:
-        logger.error("Could not load the model because a tokenizer in transformers format was not found. Please download oobabooga/llama-tokenizer.")
+        logger.error("Could not load the model because a tokenizer in Transformers format was not found.")
        return None, None

-    if shared.args.no_use_fast:
-        logger.info('Loading the tokenizer with use_fast=False.')
-
-    tokenizer = AutoTokenizer.from_pretrained(
-        path,
-        trust_remote_code=shared.args.trust_remote_code,
-        use_fast=not shared.args.no_use_fast
-    )
-
    model = LlamacppHF.from_pretrained(model_name)
-    return model, tokenizer
+    return model


 def ctransformers_loader(model_name):
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -143,7 +143,7 @@ def create_ui():
                            shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.')
                            shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.')
                            shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
-                            shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1 (recommended): place your .gguf in a subfolder of models/ along with these 4 files: special_tokens_map.json, tokenizer_config.json, tokenizer.json, tokenizer.model.\n\nOption 2: download `oobabooga/llama-tokenizer` under "Download model or LoRA". That\'s a default Llama tokenizer that will work for some (but not all) models.')
+                            shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, download a tokenizer in HF format for your GGUF:\n\n1. Create a folder inside models/\n2. Place your GGUF in the new folder.\n3. Add the original model's tokenizer files there: `tokenizer.model`, `tokenizer_config.json`, `tokenizer.json`, and `special_tokens_map.json`.")

            with gr.Column():
                with gr.Row():