From 4cd621c26de2095cd7c4464bdec5fe2e696ef3f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?DAN=E2=84=A2?= Date: Wed, 8 May 2024 06:43:23 -0400 Subject: [PATCH] convert : add BPE pre-tokenization for DBRX (#7132) * Add BPE pre-tokenization for DBRX. * Add vocab GGUFs. * Remove test. * Remove GGUFs. --- convert-hf-to-gguf-update.py | 1 + convert-hf-to-gguf.py | 3 +++ llama.cpp | 4 ++++ llama.h | 1 + 4 files changed, 9 insertions(+) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index a3fe67ee7..b51010983 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -68,6 +68,7 @@ models = [ {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", }, {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", }, {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", }, + {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", }, ] # make directory "models/tokenizers" if it doesn't exist diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index f65d9320e..8b89575d5 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -317,6 +317,9 @@ class Model(ABC): if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf res = "olmo" + if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": + # ref: https://huggingface.co/databricks/dbrx-base + res = "dbrx" if res is None: logger.warning("\n") diff --git a/llama.cpp b/llama.cpp index 26e7a3391..331c9d472 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4394,6 +4394,9 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "olmo") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO; + } else if ( + tokenizer_pre == "dbrx") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX; } else { throw
std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -12200,6 +12203,7 @@ struct llm_tokenizer_bpe { case LLAMA_VOCAB_TYPE_BPE: switch (vocab.type_pre) { case LLAMA_VOCAB_PRE_TYPE_LLAMA3: + case LLAMA_VOCAB_PRE_TYPE_DBRX: word_collection = unicode_regex_split(text, { // original regex from tokenizer.json //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", diff --git a/llama.h b/llama.h index 5761cacf4..388839f39 100644 --- a/llama.h +++ b/llama.h @@ -82,6 +82,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_REFACT = 8, LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9, LLAMA_VOCAB_PRE_TYPE_OLMO = 10, + LLAMA_VOCAB_PRE_TYPE_DBRX = 11, }; // note: these values should be synchronized with ggml_rope