From bb43cf7e9d86d69ffd9c7f008f75db890a35b45a Mon Sep 17 00:00:00 2001
From: bryanSwk <93190252+bryanSwk@users.noreply.github.com>
Date: Thu, 4 Apr 2024 02:05:10 +0800
Subject: [PATCH] llama : add SEA-LION support (#6448)

* initial commit for sealion support

* add sealion support

* minor fix

* q/k ln and pos_embd only if required

* Apply suggestions from code review

Co-authored-by: Georgi Gerganov

* minor : clear whitespaces

---------

Co-authored-by: bryan
Co-authored-by: Georgi Gerganov
---
 README.md                      |  1 +
 convert-hf-to-gguf.py          | 15 ++++++++++-
 gguf-py/gguf/constants.py      |  3 +++
 gguf-py/gguf/tensor_mapping.py |  2 ++
 llama.cpp                      | 48 +++++++++++++++++++++++++++++++---
 5 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 6f1f0cef2..67692dee1 100644
--- a/README.md
+++ b/README.md
@@ -118,6 +118,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Mamba](https://github.com/state-spaces/mamba)
 - [x] [Xverse](https://huggingface.co/models?search=xverse)
 - [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
+- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 
 **Multimodal models:**
 
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index bca1c2d79..c34bccaac 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -510,6 +510,16 @@ class BloomModel(Model):
 class MPTModel(Model):
     model_arch = gguf.MODEL_ARCH.MPT
 
+    def set_vocab(self):
+        try:
+            self._set_vocab_gpt2()
+        except Exception:
+            self._set_vocab_sentencepiece()
+            self.gguf_writer.add_add_bos_token(False)
+            self.gguf_writer.add_pad_token_id(3)
+            self.gguf_writer.add_eos_token_id(1)
+            self.gguf_writer.add_unk_token_id(0)
+
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layers"]
         self.gguf_writer.add_name(self.dir_model.name)
@@ -523,7 +533,10 @@ class MPTModel(Model):
             self.gguf_writer.add_layer_norm_eps(1e-5)
         if self.hparams["attn_config"]["clip_qkv"] is not None:
             self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
-        self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
+        if self.hparams["attn_config"]["alibi"]:
+            self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
+        else:
+            self.gguf_writer.add_max_alibi_bias(0.0)
 
     def write_tensors(self):
         block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index f468802d1..5214764a9 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -367,6 +367,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.FFN_ACT,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.POS_EMBD,
     ],
     MODEL_ARCH.GPTJ: [
         MODEL_TENSOR.TOKEN_EMBD,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 93a5a455e..345b1b0c7 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -285,11 +285,13 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm", # persimmon
+            "transformer.blocks.{bid}.attn.q_ln", # sea-lion
         ),
 
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm", # persimmon
+            "transformer.blocks.{bid}.attn.k_ln", # sea-lion
         ),
 
         MODEL_TENSOR.ROPE_FREQS: (
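Aside: a minimal Python sketch of what the two tensor_mapping.py entries above accomplish, namely resolving SEA-LION's Hugging Face tensor names to the GGUF names registered for the MPT arch. The map_tensor_name helper below is illustrative only and is not the gguf-py API; the real lookup lives in class TensorNameMap.

    # Illustrative sketch, not gguf-py code: resolve SEA-LION HF tensor
    # names (q_ln / k_ln) to the corresponding GGUF tensor names.
    SEA_LION_MAPPINGS = {
        "transformer.blocks.{bid}.attn.q_ln": "blk.{bid}.attn_q_norm",
        "transformer.blocks.{bid}.attn.k_ln": "blk.{bid}.attn_k_norm",
    }

    def map_tensor_name(hf_name: str, n_blocks: int) -> str | None:
        # Try each block index; the real class keys on templated names instead.
        for bid in range(n_blocks):
            for src, dst in SEA_LION_MAPPINGS.items():
                if hf_name == src.format(bid=bid):
                    return dst.format(bid=bid)
        return None

    assert map_tensor_name("transformer.blocks.3.attn.q_ln", 32) == "blk.3.attn_q_norm"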
diff --git a/llama.cpp b/llama.cpp
index 08ec73328..267ac4cc0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -594,6 +594,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
             { LLM_TENSOR_FFN_ACT,       "blk.%d.ffn.act" },
+            { LLM_TENSOR_POS_EMBD,      "position_embd" },
+            { LLM_TENSOR_ATTN_Q_NORM,   "blk.%d.attn_q_norm"},
+            { LLM_TENSOR_ATTN_K_NORM,   "blk.%d.attn_k_norm"},
         },
     },
     {
@@ -4867,6 +4870,7 @@ static bool llm_load_tensors(
             case LLM_ARCH_MPT:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, hparams.n_ctx_train}, false);
 
                     // output
                     {
@@ -4905,6 +4909,12 @@ static bool llm_load_tensors(
                     layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                     layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff}, false);
 
+                    layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
+                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, false);
+
+                    layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
+                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, false);
+
                     // AWQ ScaleActivation layer
                     layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
                 }
@@ -7721,6 +7731,7 @@ struct llm_build_context {
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
+        struct ggml_tensor * pos;
         struct ggml_tensor * inpL;
 
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -7731,6 +7742,16 @@ struct llm_build_context {
         // positions of the tokens in the KV cache
         struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
 
+        if (model.pos_embd) {
+            // inp_pos - contains the positions
+            struct ggml_tensor * inp_pos = build_inp_pos();
+            pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+            cb(pos, "pos_embd", -1);
+
+            inpL = ggml_add(ctx0, inpL, pos);
+            cb(inpL, "inpL", -1);
+        }
+
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * attn_norm;
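Aside: the pos_embd path above exists because SEA-LION's MPT variant ships trained absolute position embeddings instead of relying on ALiBi (hence the alibi_bias_max handling in the conversion script). A minimal NumPy sketch of the same computation, with illustrative shapes; this is not ggml code:

    import numpy as np

    n_ctx_train, n_embd, n_tokens = 2048, 8, 5
    pos_embd = np.random.randn(n_ctx_train, n_embd).astype(np.float32)  # trained table
    inpL     = np.random.randn(n_tokens, n_embd).astype(np.float32)     # token embeddings
    inp_pos  = np.arange(n_tokens)                                      # token positions

    # pos = ggml_get_rows(pos_embd, inp_pos); inpL = ggml_add(inpL, pos)
    inpL = inpL + pos_embd[inp_pos]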
@@ -7765,11 +7786,32 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                // Q/K Layernorm
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                            model.layers[il].attn_q_norm,
+                            model.layers[il].attn_q_norm_b,
+                            LLM_NORM, cb, il);
+                    cb(Qcur, "Qcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                            model.layers[il].attn_k_norm,
+                            model.layers[il].attn_k_norm_b,
+                            LLM_NORM, cb, il);
+                    cb(Kcur, "Kcur", il);
+
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                    cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                } else {
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                    cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                }
             }
 
             if (il == n_layer - 1) {
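Aside: in the Q/K layernorm branch above, Q and K are normalized over the full n_embd dimension before being reshaped into heads, and llm_build_kv receives nullptr in place of KQ_pos, so no ALiBi bias is applied when the model carries position embeddings. A minimal NumPy sketch of the Q side (K is handled identically); shapes and values are illustrative and this is not the llama.cpp implementation:

    import numpy as np

    def layer_norm(x, w, b, eps=1e-5):
        # what llm_build_norm(..., LLM_NORM, ...) computes: layernorm with weight and bias
        mu  = x.mean(axis=-1, keepdims=True)
        var = x.var(axis=-1, keepdims=True)
        return (x - mu) / np.sqrt(var + eps) * w + b

    n_tokens, n_embd, n_head = 5, 8, 2
    Qcur = np.random.randn(n_tokens, n_embd).astype(np.float32)
    q_w, q_b = np.ones(n_embd, np.float32), np.zeros(n_embd, np.float32)

    Qcur = layer_norm(Qcur, q_w, q_b)                        # Q layernorm before splitting heads
    Qcur = Qcur.reshape(n_tokens, n_head, n_embd // n_head)  # ggml_reshape_3d(..., n_embd_head, n_head, n_tokens)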