diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index b36b5193c..dc70e26d5 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2728,6 +2728,8 @@ class ChatGLMModel(Model):
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
         vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
         assert max(tokenizer.get_vocab().values()) < vocab_size
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
         print(vocab_size)
         print(max(tokenizer.get_vocab().values()))
         for token_id in range(vocab_size):
@@ -2750,7 +2752,12 @@ class ChatGLMModel(Model):
                 text = f"[PAD{token_id}]".encode("utf-8")
 
             if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
-                toktype = SentencePieceTokenTypes.UNKNOWN
+                if piece in special_tokens:
+                    # show special tokens in prompt
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                else:
+                    print(f"unknown token: {piece}")
+                    toktype = SentencePieceTokenTypes.UNKNOWN
             tokens.append(text)
             scores.append(score)
             toktypes.append(toktype)
@@ -2856,9 +2863,9 @@ class ChatGLMModel(Model):
         special_vocab.chat_template = "ChatGLM4"
         special_vocab.merges = merges
         # only add special tokens when they were not already loaded from config.json
-        if len(special_vocab.special_token_ids) == 0:
-            special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
-            special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        # if len(special_vocab.special_token_ids) == 0:
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
         # this one is usually not in config.json anyway
         special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
         special_vocab.add_to_gguf(self.gguf_writer)
@@ -2955,7 +2962,7 @@ def parse_args() -> argparse.Namespace:
         help="model is executed on big endian machine",
     )
     parser.add_argument(
-        "model", type=Path,
+        "--model", type=Path,
         help="directory containing model file",
     )
     parser.add_argument(
diff --git a/llama.cpp b/llama.cpp
index a0255bac8..9e23f6643 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1802,9 +1802,11 @@ enum e_model {
     MODEL_2_8B,
     MODEL_3B,
     MODEL_4B,
+    MODEL_6B,
     MODEL_6_9B,
     MODEL_7B,
     MODEL_8B,
+    MODEL_9B,
     MODEL_12B,
     MODEL_13B,
     MODEL_14B,
@@ -3918,9 +3920,11 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_2_8B:   return "2.8B";
         case MODEL_3B:     return "3B";
         case MODEL_4B:     return "4B";
+        case MODEL_6B:     return "6B";
         case MODEL_6_9B:   return "6.9B";
         case MODEL_7B:     return "7B";
         case MODEL_8B:     return "8B";
+        case MODEL_9B:     return "9B";
         case MODEL_12B:    return "12B";
         case MODEL_13B:    return "13B";
         case MODEL_14B:    return "14B";
@@ -4507,8 +4511,8 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
-                    case 28: model.type = e_model::MODEL_7B; break;
-                    case 40: model.type = e_model::MODEL_8B; break;
+                    case 28: model.type = e_model::MODEL_6B; break;
+                    case 40: model.type = e_model::MODEL_9B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -18362,6 +18366,19 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
 }
 
 bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
+    auto arch_name = llama_model_arch_name(model->arch);
+    auto vocab_type = model->vocab.type;
+    if (strcmp(arch_name, "chatglm") == 0) {
+        if (LLAMA_VOCAB_TYPE_BPE == vocab_type) { // glm4
+            return token != -1 && (
+                token == llama_token_eos(model) ||
+                token == llama_token_eot(model) ||
+                token == 151329 ||
+                token == 151336 ||
+                token == 151338
+            );
+        }
+    }
     return token != -1 && (
         token == llama_token_eos(model) ||
         token == llama_token_eot(model)
@@ -18424,8 +18441,18 @@ int32_t llama_tokenize(
                      int32_t   n_tokens_max,
                         bool   add_special,
                         bool   parse_special) {
-    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);
-
+    auto arch_name = llama_model_arch_name(model->arch);
+    auto prompt = std::move(std::string(text, text_len));
+    auto vocab_type = model->vocab.type;
+    if (strcmp(arch_name, "chatglm") == 0) {
+        // chatglm3
+        if (LLAMA_VOCAB_TYPE_SPM == vocab_type) {
+            prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>";
+        } else if (LLAMA_VOCAB_TYPE_BPE == vocab_type) { // glm4
+            prompt = "[gMASK]<|user|>\n" + prompt + "<|assistant|>";
+        }
+    }
+    auto res = llama_tokenize_internal(model->vocab, prompt, add_special, parse_special);
     if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
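The llama_tokenize() hunk above hard-codes the ChatGLM chat template around the raw prompt before tokenization. Below is a minimal standalone C++ sketch of that wrapping logic, for illustration only: the enum, function name, and main() are hypothetical and not part of the patch; only the template strings and the SPM-vs-BPE distinction come from the diff.

```cpp
#include <iostream>
#include <string>

// Illustrative stand-in for the vocab type check done via model->vocab.type in the patch.
enum vocab_type { VOCAB_SPM, VOCAB_BPE };

// Wrap a raw user prompt the way the patched llama_tokenize() does for ChatGLM models.
static std::string chatglm_wrap_prompt(const std::string & user_text, vocab_type vt) {
    if (vt == VOCAB_SPM) {
        // ChatGLM3 (SentencePiece vocab)
        return "[gMASK]sop<|user|>\n" + user_text + "<|assistant|>";
    }
    // GLM-4 (BPE vocab)
    return "[gMASK]<|user|>\n" + user_text + "<|assistant|>";
}

int main() {
    std::cout << chatglm_wrap_prompt("Hello", VOCAB_SPM) << "\n";
    std::cout << chatglm_wrap_prompt("Hello", VOCAB_BPE) << "\n";
    return 0;
}
```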