From c3f8d583560b4f261fa21c976793e538c60cd66c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 21 May 2024 19:53:48 +0300 Subject: [PATCH] tests : test-tokenizer-0.sh print more info (#7402) --- convert-hf-to-gguf-update.py | 2 +- convert-hf-to-gguf.py | 2 +- tests/test-tokenizer-0.sh | 9 +++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 45404b32b..1923b88ba 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -72,7 +72,7 @@ models = [ {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, - {"name": "stablelm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", }, + {"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", }, {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", }, {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", }, {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", }, diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 1acf45bf2..6357d4034 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -447,7 +447,7 @@ class Model: # ref: https://huggingface.co/openai-community/gpt2 res = "gpt-2" if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": - # ref: https://huggingface.co/stabilityai/stablelm-2-1_6b + # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b res = "stablelm2" if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": # ref: https://huggingface.co/smallcloudai/Refact-1_6-base diff --git a/tests/test-tokenizer-0.sh b/tests/test-tokenizer-0.sh index 2fb8632d8..1fec8bbf1 100755 --- a/tests/test-tokenizer-0.sh +++ b/tests/test-tokenizer-0.sh @@ -17,10 +17,15 @@ make -j tests/test-tokenizer-0 printf "Testing %s on %s ...\n" $name $input -python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1 -cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in" +set -e +printf "Tokenizing using (py) Python AutoTokenizer ...\n" +python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1 + +printf "Tokenizing using (cpp) llama.cpp ...\n" ./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1 + +cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in" cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in" diff $input.tok $input.tokcpp > /dev/null 2>&1