2024-05-05 07:07:48 +02:00
#!/usr/bin/env python3
2024-06-05 19:07:24 +02:00
# -*- coding: utf-8 -*-
2024-05-05 07:07:48 +02:00
2024-04-29 15:58:41 +02:00
# This script downloads the tokenizer models of the specified models from Huggingface and
2024-07-05 06:53:33 +02:00
# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
2024-04-29 15:58:41 +02:00
#
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
# provide the necessary information to llama.cpp via the GGUF header in order to implement
# the same pre-tokenizer.
#
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
#
# Instructions:
#
# - Add a new model to the "models" list
# - Run the script with your huggingface token:
#
2024-07-05 06:53:33 +02:00
# python3 convert_hf_to_gguf_update.py <huggingface_token>
2024-04-29 15:58:41 +02:00
#
2024-12-05 08:47:55 +01:00
# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
2024-04-29 15:58:41 +02:00
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp
#
2024-05-03 21:36:41 +02:00
import logging
2024-04-29 15:58:41 +02:00
import os
2024-05-17 14:11:45 +02:00
import pathlib
import re
2024-04-29 15:58:41 +02:00
import requests
import sys
import json
2024-09-11 14:29:51 +02:00
import shutil
2024-04-29 15:58:41 +02:00
from hashlib import sha256
from enum import IntEnum , auto
2024-05-03 21:36:41 +02:00
from transformers import AutoTokenizer
2024-05-04 07:32:32 +02:00
# Show every message from DEBUG level upwards while the script runs.
logging.basicConfig(level=logging.DEBUG)

# Logger shared by all steps of this script.
logger = logging.getLogger("convert_hf_to_gguf_update")

# One HTTP session, reused by download_file_with_auth() for every request.
sess = requests.Session()
2024-05-03 21:36:41 +02:00
2024-04-29 15:58:41 +02:00
class TOKENIZER_TYPE(IntEnum):
    """Tokenizer family tag carried by every entry of the ``models`` list.

    The tag decides which extra model file is fetched for an entry and
    whether a pre-tokenizer hash branch is generated for it.
    """

    # "tokenizer.model" is fetched in addition; no hash branch is generated
    # (presumably SentencePiece - confirm)
    SPM = auto()

    # a hash branch is generated for these entries
    BPE = auto()

    # handled like BPE by this script (presumably WordPiece - confirm)
    WPM = auto()

    # "spiece.model" is fetched in addition; no hash branch is generated
    # (presumably Unigram - confirm)
    UGM = auto()
2024-04-29 15:58:41 +02:00
2024-05-03 21:36:41 +02:00
2024-04-29 15:58:41 +02:00
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
#
# The text is encoded with each model's tokenizer and the resulting token list is
# hashed (chkhsh); its repr() is also embedded into the generated
# get_vocab_base_pre() function. Editing it can therefore change the computed hashes.
# It is additionally used as the last entry of the "tests" list.
2024-07-22 15:44:53 +02:00
CHK_TXT = ' \n \n \n \n \n \n \t \t \t \t \n \n \n \n \n 🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \' \' \' \' \' \' ``````` \" \" \" \" ......!!!!!!?????? I \' ve been \' told he \' s there, \' RE you sure? \' M not sure I \' ll make it, \' D you like some tea? We \' Ve a \' lL '
2024-04-29 15:58:41 +02:00
# The Hugging Face access token is the one and only command-line argument.
if len(sys.argv) != 2:
    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
    sys.exit(1)

token = sys.argv[1]

# Cheap sanity check on the token prefix before any download is attempted.
if not token.startswith("hf_"):
    logger.info("Huggingface token seems invalid")
    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
    sys.exit(1)
# TODO: add models here, base models preferred
# Each entry describes one tokenizer to process:
#   "name" - folder name under models/tokenizers/ and the value returned as "res"
#            by the generated get_vocab_base_pre()
#   "tokt" - TOKENIZER_TYPE of the model
#   "repo" - Hugging Face repository URL, or a local directory holding the files
models = [
    {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf"},
    {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B"},
    {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct"},
    {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base"},
    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base"},
    {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b"},
    {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5"},
    {"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base"},
    {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5"},
    {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b"},
    {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b"},
    {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2"},
    {"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b"},
    {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base"},
    {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01"},
    {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B"},
    {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf"},
    {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base"},
    {"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en"},
    {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en"},  # WPM!
    {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es"},
    {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de"},
    {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct"},
    {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat"},
    {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code"},
    {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B"},  # Also used for Viking 13B and 33B
    {"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b"},
    {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b"},
    {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b"},
    {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small"},
    {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B"},
    {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407"},
    {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M"},
    {"name": "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom"},
    {"name": "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small"},
    {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"},
    {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2"},
    {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b"},
    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0"},
    {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
    {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
    {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
    {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
]
2024-04-29 15:58:41 +02:00
2024-05-03 21:36:41 +02:00
2024-04-29 15:58:41 +02:00
def download_file_with_auth(url, token, save_path, timeout=60):
    """Download ``url`` using a bearer token and store the body at ``save_path``.

    Args:
        url: Address of the file to fetch.
        token: Access token sent in the ``Authorization`` header.
        save_path: Destination file. Missing parent directories are created.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests`` (a number or a ``(connect, read)`` tuple).
            Without it a stalled connection would block the script forever.

    Raises:
        requests.RequestException: on connection problems, timeouts, or when
            the server answers with an error status (``raise_for_status``).
        OSError: if the destination cannot be created or written.
    """
    headers = {"Authorization": f"Bearer {token}"}
    response = sess.get(url, headers=headers, timeout=timeout)
    # Fail before touching the file system so no empty file is left behind.
    response.raise_for_status()

    # os.makedirs("") raises, so only create a directory when the path has one.
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(save_path, "wb") as downloaded_file:
        downloaded_file.write(response.content)
    logger.info(f"File {save_path} downloaded successfully")
2024-05-03 21:36:41 +02:00
2024-04-29 15:58:41 +02:00
2024-05-17 14:11:45 +02:00
def download_model(model):
    """Place the tokenizer files of one ``models`` entry in its local folder.

    Files go to ``models/tokenizers/<name>/``. When ``model["repo"]`` is an
    existing directory the files are copied from it, otherwise they are
    downloaded from ``<repo>/resolve/main/<file>`` using the module-level
    ``token``. Files that already exist at the destination are left alone.

    Args:
        model: Mapping with the keys ``"name"``, ``"repo"`` and ``"tokt"``.
    """
    name, repo, tokt = model["name"], model["repo"], model["tokt"]

    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)

    # Files wanted for every model, plus one extra file for SPM / UGM entries.
    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
    if tokt == TOKENIZER_TYPE.SPM:
        files.append("tokenizer.model")
    if tokt == TOKENIZER_TYPE.UGM:
        files.append("spiece.model")

    # A repo that is a directory on disk is copied from; anything else is a URL.
    repo_is_local_dir = os.path.isdir(repo)

    for file in files:
        target = f"models/tokenizers/{name}/{file}"
        if os.path.isfile(target):
            logger.info(f"{name}: File {target} already exists - skipping")
            continue

        if not repo_is_local_dir:
            download_file_with_auth(f"{repo}/resolve/main/{file}", token, target)
            continue

        source = os.path.join(repo, file)
        if os.path.isfile(source):
            shutil.copy2(source, target)
            logger.info(f"{name}: Copied {source} to {target}")
        else:
            logger.warning(f"{name}: Source file {source} does not exist")
2024-04-29 15:58:41 +02:00
2024-05-05 07:19:30 +02:00
2024-05-17 14:11:45 +02:00
# Fetch the tokenizer files of every listed model. A failure for one entry is
# logged and the remaining entries are still processed.
for model in models:
    try:
        download_model(model)
    except Exception as err:
        logger.error(f"Failed to download model {model['name']}. Error: {err}")
2024-04-29 15:58:41 +02:00
2024-07-05 06:53:33 +02:00
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
2024-04-29 15:58:41 +02:00
# One "if chkhsh == ..." branch is collected per model whose tokenizer loads;
# the branches are joined into src_ifs once the loop is done.
hash_branches = []
for model in models:
    name = model["name"]
    tokt = model["tokt"]

    # SPM and UGM entries get no hash branch
    if tokt in (TOKENIZER_TYPE.SPM, TOKENIZER_TYPE.UGM):
        continue

    tokenizer_dir = f"models/tokenizers/{name}"

    # Skip if the tokenizer folder does not exist or there are other download issues previously
    if not os.path.exists(tokenizer_dir):
        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
        continue

    # create the tokenizer
    try:
        if name == "t5":
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False)
        else:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
    except OSError as e:
        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
        continue  # Skip to the next model if the tokenizer can't be loaded

    # The hash of the token ids produced for CHK_TXT identifies the pre-tokenizer.
    chktok = tokenizer.encode(CHK_TXT)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

    logger.info(f"model: {name}")
    logger.info(f"tokt: {tokt}")
    logger.info(f"repo: {model['repo']}")
    logger.info(f"chktok: {chktok}")
    logger.info(f"chkhsh: {chkhsh}")

    # print the "pre_tokenizer" content from the tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)
    logger.info("normalizer: " + json.dumps(cfg["normalizer"], indent=4))
    logger.info("pre_tokenizer: " + json.dumps(cfg["pre_tokenizer"], indent=4))
    if "ignore_merges" in cfg["model"]:
        logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))

    logger.info("")

    # NOTE(review): the leading indentation of the generated lines is taken to be
    # 8 / 12 spaces so that they sit inside the body of get_vocab_base_pre() -
    # confirm against the marker section of convert_hf_to_gguf.py.
    hash_branches.append(
        f"        if chkhsh == \"{chkhsh}\":\n"
        f"            # ref: {model['repo']}\n"
        f"            res = \"{name}\"\n"
    )

src_ifs = "".join(hash_branches)
2024-04-29 15:58:41 +02:00
2024-05-03 21:36:41 +02:00
# Source text of the get_vocab_base_pre() method that is spliced into
# convert_hf_to_gguf.py between its "Marker: Start/End get_vocab_base_pre" comments.
# This is an f-string: the repr of CHK_TXT and the generated src_ifs branches are
# substituted here, while doubled braces ({{ ... }}) come out as single braces so
# that the generated code keeps its own f-string placeholders.
src_func = f """
def get_vocab_base_pre ( self , tokenizer ) - > str :
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
# is specific for the BPE pre-tokenizer used by the model
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer
2024-07-22 15:44:53 +02:00
chktxt = { repr ( CHK_TXT ) }
2024-05-03 21:36:41 +02:00
chktok = tokenizer . encode ( chktxt )
chkhsh = sha256 ( str ( chktok ) . encode ( ) ) . hexdigest ( )
2024-05-04 07:32:32 +02:00
logger . debug ( f " chktok: {{ chktok }} " )
logger . debug ( f " chkhsh: {{ chkhsh }} " )
2024-05-03 21:36:41 +02:00
res = None
2024-07-05 06:53:33 +02:00
# NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
2024-05-03 21:36:41 +02:00
# or pull the latest version of the model from Huggingface
# don't edit the hashes manually!
{ src_ifs }
if res is None :
2024-05-04 07:32:32 +02:00
logger . warning ( " \\ n " )
logger . warning ( " ************************************************************************************** " )
logger . warning ( " ** WARNING: The BPE pre-tokenizer was not recognized! " )
logger . warning ( " ** There are 2 possible reasons for this: " )
2024-07-05 06:53:33 +02:00
logger . warning ( " ** - the model has not been added to convert_hf_to_gguf_update.py yet " )
2024-05-04 07:32:32 +02:00
logger . warning ( " ** - the pre-tokenization config has changed upstream " )
2024-07-05 06:53:33 +02:00
logger . warning ( " ** Check your model files and convert_hf_to_gguf_update.py and update them accordingly. " )
2024-05-04 07:32:32 +02:00
logger . warning ( " ** ref: https://github.com/ggerganov/llama.cpp/pull/6920 " )
logger . warning ( " ** " )
logger . warning ( f " ** chkhsh: {{ chkhsh }} " )
logger . warning ( " ************************************************************************************** " )
logger . warning ( " \\ n " )
2024-05-03 21:36:41 +02:00
raise NotImplementedError ( " BPE pre-tokenizer was not recognized - update get_vocab_base_pre() " )
2024-05-04 07:32:32 +02:00
logger . debug ( f " tokenizer.ggml.pre: {{ repr(res) }} " )
logger . debug ( f " chkhsh: {{ chkhsh }} " )
2024-05-03 21:36:41 +02:00
return res
"""
2024-07-05 06:53:33 +02:00
# Splice the freshly generated get_vocab_base_pre() into convert_hf_to_gguf.py,
# replacing whatever currently sits between the two marker comments.
convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
convert_py = convert_py_pth.read_text(encoding="utf-8")

# subn() also reports how many marker sections were replaced, so a file
# without markers is no longer reported as "updated".
convert_py, n_replaced = re.subn(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    # a callable replacement keeps backslashes in src_func from being read as escapes
    lambda m: m.group(1) + src_func + m.group(3),
    convert_py,
    flags=re.DOTALL | re.MULTILINE,
)

if n_replaced:
    convert_py_pth.write_text(convert_py, encoding="utf-8")
    logger.info("+++ convert_hf_to_gguf.py was updated")
else:
    # Best effort: leave the file untouched and carry on with test generation.
    logger.warning("get_vocab_base_pre() markers not found in convert_hf_to_gguf.py - file was not updated")
2024-04-29 15:58:41 +02:00
# generate tests for each tokenizer model
# Input texts that every loadable tokenizer is run on further below; each one is
# written to the .inp file and its token ids to the .out file. CHK_TXT is the last case.
# NOTE(review): several entries appear to consist only of whitespace, so the exact
# spacing inside these literals presumably matters - do not reformat them.
tests = [
2024-05-04 07:32:32 +02:00
" ied 4 ½ months " ,
" Führer " ,
2024-04-29 15:58:41 +02:00
" " ,
" " ,
" " ,
" " ,
" \t " ,
" \n " ,
" \n \n " ,
" \n \n \n " ,
" \t \n " ,
" Hello world " ,
" Hello world " ,
" Hello World " ,
" Hello World " ,
" Hello World! " ,
" Hello, world! " ,
" Hello, world! " ,
" this is 🦙.cpp " ,
" w048 7tuijk dsdfhu " ,
" нещо на Български " ,
" កាន់តែពិសេសអាចខលចេញ " ,
" 🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) " ,
" Hello " ,
" Hello " ,
" Hello " ,
" Hello " ,
" Hello " ,
" Hello \n Hello " ,
" ( " ,
" \n = " ,
" ' era " ,
" Hello, y ' all! How are you 😁 ?我想在apple工作1314151天~ " ,
2024-07-04 15:46:11 +02:00
" !!!!!! " ,
2024-04-29 15:58:41 +02:00
" 3 " ,
" 33 " ,
" 333 " ,
" 3333 " ,
" 33333 " ,
" 333333 " ,
" 3333333 " ,
" 33333333 " ,
" 333333333 " ,
2024-07-04 09:41:03 +02:00
" Cửa Việt " , # llama-bpe fails on this
" discards " ,
2024-07-22 15:44:53 +02:00
CHK_TXT ,
2024-04-29 15:58:41 +02:00
]
# write the tests to ./models/ggml-vocab-{name}.gguf.inp
# the format is:
#
# test0
# __ggml_vocab_test__
# test1
# __ggml_vocab_test__
# ...
#
# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
# for each test, write the resulting tokens on a separate line
# For every model whose tokenizer can be loaded, write the test inputs to
# models/ggml-vocab-<name>.gguf.inp and the matching token ids to
# models/ggml-vocab-<name>.gguf.out.
for model in models:
    name = model["name"]

    tokenizer_dir = f"models/tokenizers/{name}"

    # Skip if the tokenizer folder does not exist or there are other download issues previously
    if not os.path.exists(tokenizer_dir):
        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
        continue

    # create the tokenizer
    try:
        if name == "t5":
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False)
        else:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
    except OSError as e:
        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
        continue  # Skip this model and continue with the next one in the loop

    # Inputs: each test text followed by the separator line.
    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
        for text in tests:
            f.write(f"{text}")
            f.write("\n__ggml_vocab_test__\n")

    # Expected outputs: one line of token ids per test. The encoding is given
    # explicitly so both files are written the same way on every platform.
    with open(f"models/ggml-vocab-{name}.gguf.out", "w", encoding="utf-8") as f:
        for text in tests:
            res = tokenizer.encode(text, add_special_tokens=False)
            # NOTE(review): ids are taken to be separated by one leading space
            # each - confirm against an existing .out file.
            for r in res:
                f.write(f" {r}")
            f.write("\n")

    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
2024-04-29 15:58:41 +02:00
# generate commands for creating vocab files
2024-05-03 21:36:41 +02:00
logger.info("\nRun the following commands to generate the vocab files for testing:\n")

# The commands go to stdout via print() so they can be copied without log prefixes.
for entry in models:
    name = entry["name"]
    print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")  # noqa: NP100

logger.info("\n")