diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 550dd5cfd..746ff2ba5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2224,9 +2224,8 @@ class InternLM2Model(Model): def set_vocab(self): # (TODO): Is there a better way? - # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character - # \x00 specially and convert it into an emoji character to prevent it from being mistakenly - # recognized as an empty string in C++. + # Copy from _set_vocab_sentencepiece, The only difference is that we find mislabeled UNUSED tokens, + # and that we set '<|im_end|>' as the eos token for chat models. from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model @@ -2253,11 +2252,6 @@ class InternLM2Model(Model): piece = tokenizer.IdToPiece(token_id) text = piece.encode("utf-8") score = tokenizer.GetScore(token_id) - if text == b"\x00": - # (TODO): fixme - # Hack here and replace the \x00 characters. - logger.warning(f"InternLM2 convert token '{text}' to '🐉'!") - text = "🐉".encode("utf-8") toktype = SentencePieceTokenTypes.NORMAL if tokenizer.IsUnknown(token_id): diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 8ca9f8915..bf61e4d09 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -561,7 +561,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam vocab->id_to_token.resize(n_vocab); for (uint32_t i = 0; i < n_vocab; i++) { - std::string word = gguf_get_arr_str(ctx, token_idx, i); + std::string word(gguf_get_arr_str(ctx, token_idx, i), gguf_get_arr_str_n(ctx, token_idx, i)); vocab->token_to_id[word] = i; diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 3176d6e26..22f7fb84a 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -12,7 +12,7 @@ static bool g_verbose = false; static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){ int id = gguf_find_key(ctx_gguf, key.c_str()); - return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); + return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id), gguf_get_val_str_n(ctx_gguf, id)); } static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) { diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 54aa822c9..2ef989adb 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -225,7 +225,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { switch (type) { case GGUF_TYPE_STRING: - return gguf_get_val_str(ctx_gguf, i); + return std::string(gguf_get_val_str(ctx_gguf, i), gguf_get_val_str_n(ctx_gguf, i)); case GGUF_TYPE_ARRAY: { const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); @@ -235,7 +235,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { ss << "["; for (int j = 0; j < arr_n; j++) { if (arr_type == GGUF_TYPE_STRING) { - std::string val = gguf_get_arr_str(ctx_gguf, i, j); + std::string val(gguf_get_arr_str(ctx_gguf, i, j), gguf_get_arr_str_n(ctx_gguf, i, j)); // escape quotes replace_all(val, "\\", "\\\\"); replace_all(val, "\"", "\\\""); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 15602a96d..6eff98b9c 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2313,10 +2313,12 @@ extern "C" { GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id); GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id); GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id); + GGML_API int gguf_get_val_str_n(const struct gguf_context * ctx, int key_id); GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id); GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id); GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id); GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); + GGML_API int gguf_get_arr_str_n(const struct gguf_context * ctx, int key_id, int i); GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 38990e3a0..9f9510891 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -21335,6 +21335,14 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i return str->data; } +int gguf_get_arr_str_n(const struct gguf_context * ctx, int key_id, int i) { + GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); + struct gguf_kv * kv = &ctx->kv[key_id]; + struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i]; + return str->n; +} + int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) { GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); @@ -21413,6 +21421,12 @@ const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) { return ctx->kv[key_id].value.str.data; } +int gguf_get_val_str_n(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING); + return ctx->kv[key_id].value.str.n; +} + const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) { GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY); diff --git a/src/llama.cpp b/src/llama.cpp index aaf8db496..ca8c2963a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1406,7 +1406,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { switch (type) { case GGUF_TYPE_STRING: - return gguf_get_val_str(ctx_gguf, i); + return std::string(gguf_get_val_str(ctx_gguf, i), gguf_get_val_str_n(ctx_gguf, i)); case GGUF_TYPE_ARRAY: { const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); @@ -1416,7 +1416,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { ss << "["; for (int j = 0; j < arr_n; j++) { if (arr_type == GGUF_TYPE_STRING) { - std::string val = gguf_get_arr_str(ctx_gguf, i, j); + std::string val(gguf_get_arr_str(ctx_gguf, i, j), gguf_get_arr_str_n(ctx_gguf, i, j)); // escape quotes replace_all(val, "\\", "\\\\"); replace_all(val, "\"", "\\\""); @@ -3436,7 +3436,7 @@ namespace GGUFMeta { static constexpr gguf_type gt = GGUF_TYPE_STRING; static std::string getter(const gguf_context * ctx, const int kid) { - return gguf_get_val_str(ctx, kid); + return std::string(gguf_get_val_str(ctx, kid), gguf_get_val_str_n(ctx, kid)); } }; @@ -5316,7 +5316,7 @@ static void llm_load_vocab( const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); for (int i = 0; i < n_merges; i++) { - const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); + const std::string word(gguf_get_arr_str(ctx, merges_keyidx, i), gguf_get_arr_str_n(ctx, merges_keyidx, i)); GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); std::string first; @@ -5521,7 +5521,7 @@ static void llm_load_vocab( vocab.id_to_token.resize(n_vocab); for (uint32_t i = 0; i < n_vocab; i++) { - std::string word = gguf_get_arr_str(ctx, token_idx, i); + std::string word(gguf_get_arr_str(ctx, token_idx, i), gguf_get_arr_str_n(ctx, token_idx, i)); GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); vocab.token_to_id[word] = i; @@ -16207,7 +16207,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c { auto get_kv_str = [&](const std::string & key) -> std::string { int id = gguf_find_key(ctx_gguf, key.c_str()); - return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); + return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id), gguf_get_val_str_n(ctx_gguf, id)); }; auto get_kv_f32 = [&](const std::string & key) -> float { int id = gguf_find_key(ctx_gguf, key.c_str());