From c8cdb48d105abc4486d9a778127b73d282ec8289 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 30 Jun 2024 23:13:48 -0400 Subject: [PATCH] llama : support all OpenELM models * llama : add variable GQA and variable FFN sizes Some metadata keys can now also be arrays to support setting their value per-layer for models like OpenELM. --- convert-hf-to-gguf.py | 161 +++++++++++++-------- gguf-py/gguf/constants.py | 3 +- gguf-py/gguf/gguf_writer.py | 21 ++- gguf-py/gguf/tensor_mapping.py | 1 - src/llama.cpp | 249 +++++++++++++++++---------------- 5 files changed, 247 insertions(+), 188 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 355737376..1549380b4 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -13,7 +13,7 @@ import sys from enum import IntEnum from pathlib import Path from hashlib import sha256 -from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast import math import numpy as np @@ -669,6 +669,51 @@ class Model: special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): + tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf" + logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + vocab_reader = gguf.GGUFReader(tokenizer_path, "r") + + default_pre = "mpt" if model_name == "gpt-neox" else "default" + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL) + assert field # tokenizer model + self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8")) + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE) + self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre) + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST) + assert field # token list + self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) + + if model_name == "llama-spm": + field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) + assert field # token scores + self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) + assert field # token types + self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + + if model_name != "llama-spm": + field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) + assert field # token merges + self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) + + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None: + self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None: + self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None: + self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None: + self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None: + self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None: + self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) + @Model.register("GPTNeoXForCausalLM") class GPTNeoXModel(Model): @@ -2410,39 +2455,7 @@ class MambaModel(Model): self._set_vocab_sentencepiece() else: # Use the GPT-NeoX tokenizer when no tokenizer files are present - tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" - logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") - neox_reader = gguf.GGUFReader(tokenizer_path, "r") - - field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) - self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2") - - field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE) - self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt") - - field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) - assert field - self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) - assert field - self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES) - assert field - self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID) - self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID) - self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID) - self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID) - self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0) + self._set_vocab_builtin("gpt-neox", vocab_size) def set_gguf_parameters(self): d_model = self.find_hparam(["hidden_size", "d_model"]) @@ -2598,45 +2611,77 @@ class JinaBertV2Model(BertModel): class OpenELMModel(Model): model_arch = gguf.MODEL_ARCH.OPENELM - # Copied from LlamaModel + @staticmethod + def _make_divisible(v: float | int, divisor: int) -> int: + # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38 + new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + ffn_multipliers: list[float] = self.hparams["ffn_multipliers"] + ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"] + self._n_embd: int = self.hparams["model_dim"] + self._num_kv_heads: list[int] = self.hparams["num_kv_heads"] + self._num_query_heads: list[int] = self.hparams["num_query_heads"] + self._ffn_dims: list[int] = [ + OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor) + for multiplier in ffn_multipliers + ] + assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) + assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int) + + # Uses the tokenizer from meta-llama/Llama-2-7b-hf def set_vocab(self): try: - self. _set_vocab_sentencepiece() + self._set_vocab_sentencepiece() except FileNotFoundError: - self._set_vocab_llama_hf() + self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"]) def set_gguf_parameters(self): - # TODO: Look closer at these + n_embd = self._n_embd + head_dim = self.hparams["head_dim"] + rot_pct = 1.0 + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_query_heads) + assert self.block_count == len(self._ffn_dims) - self.gguf_writer.add_name("OpenELM") - self.block_count = self.find_hparam(["num_transformer_layers"]) - self.gguf_writer.add_layer_norm_eps(1e-5) + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["max_context_length"]) + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(self._ffn_dims) + self.gguf_writer.add_head_count(self._num_query_heads) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"]) # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30 self.gguf_writer.add_layer_norm_rms_eps(1e-6) - n_embd = self.find_hparam(["model_dim"]) - self.gguf_writer.add_embedding_length(n_embd) - head_dim = self.find_hparam(["head_dim"]) - n_head = n_embd // head_dim - rot_pct = 1.0 - self.gguf_writer.add_context_length(self.find_hparam(["max_context_length"])) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count_kv(n_head*10) - self.gguf_writer.add_head_count(n_head*10) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim)) + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_feed_forward_length(0) # dynamically calculated def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: - # TODO: Read configuration! if "n_layers" in keys: - return 16 # num_transformer_layers - if "hidden_size" in keys: - return 1280 # model_dim - if "num_attention_heads" in keys: - return 64 # head_dim + return self.hparams["num_transformer_layers"] return super().find_hparam(keys, optional) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + + # split ff + if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight": + ff_dim = self._ffn_dims[bid] + yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim]) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:]) + return + + yield (self.map_tensor_name(name), data_torch) + @Model.register("ArcticForCausalLM") class ArcticModel(Model): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a0c38e8fa..b2a186712 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -869,8 +869,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ATTN_K_NORM, MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, ], MODEL_ARCH.ARCTIC: [ MODEL_TENSOR.TOKEN_EMBD, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 1aeb0d9b0..9086879de 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -480,8 +480,11 @@ class GGUFWriter: def add_leading_dense_block_count(self, length: int) -> None: self.add_uint32(Keys.LLM.LEADING_DENSE_BLOCK_COUNT.format(arch=self.arch), length) - def add_feed_forward_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length) + def add_feed_forward_length(self, length: int | Sequence[int]) -> None: + if isinstance(length, int): + self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length) + else: + self.add_array(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length) def add_expert_feed_forward_length(self, length: int) -> None: self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length) @@ -495,11 +498,17 @@ class GGUFWriter: def add_decoder_start_token_id(self, id: int) -> None: self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id) - def add_head_count(self, count: int) -> None: - self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count) + def add_head_count(self, count: int | Sequence[int]) -> None: + if isinstance(count, int): + self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count) + else: + self.add_array(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count) - def add_head_count_kv(self, count: int) -> None: - self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count) + def add_head_count_kv(self, count: int | Sequence[int]) -> None: + if isinstance(count, int): + self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count) + else: + self.add_array(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count) def add_key_length(self, length: int) -> None: self.add_uint32(Keys.Attention.KEY_LENGTH.format(arch=self.arch), length) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index ce8bcbe19..71159d157 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -267,7 +267,6 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.fc11", # nomic-bert "model.layers.{bid}.mlp.c_fc", # starcoder2 "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 - "transformer.layers.{bid}.ffn.proj_1", # openelm "model.layers.{bid}.residual_mlp.w3", # arctic ), diff --git a/src/llama.cpp b/src/llama.cpp index 176ad1495..bce9dfb07 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1143,8 +1143,9 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, { @@ -2111,6 +2112,11 @@ struct llama_hparams { uint32_t n_expert_used = 0; uint32_t n_vocab_type = 0; // for BERT-style token types + // TODO: find a more compact way to add more per-layer hyper-parameters + std::vector n_head_vec; + std::vector n_head_kv_vec; + std::vector n_ff_vec; + uint32_t n_layer_dense_lead = 0; uint32_t n_lora_q = 0; uint32_t n_lora_kv = 0; @@ -2164,6 +2170,10 @@ struct llama_hparams { if (this->n_expert != other.n_expert) return true; if (this->n_expert_used != other.n_expert_used) return true; + if (this->n_head_vec != other.n_head_vec) return true; + if (this->n_head_kv_vec != other.n_head_kv_vec) return true; + if (this->n_ff_vec != other.n_ff_vec) return true; + if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true; if (this->n_lora_q != other.n_lora_q) return true; if (this->n_lora_kv != other.n_lora_kv) return true; @@ -2192,18 +2202,53 @@ struct llama_hparams { return false; } - uint32_t n_gqa() const { + // TODO: deduplicate per-layer getters + uint32_t n_head_l(uint32_t layer) const { + if (layer < n_head_vec.size()) { + int32_t n_h_l = n_head_vec[layer]; + // TODO: what should happen when it's negative? + GGML_ASSERT(n_h_l >= 0); + return n_h_l; + } + return n_head; + } + + uint32_t n_head_kv_l(uint32_t layer) const { + if (layer < n_head_kv_vec.size()) { + int32_t n_hkv_l = n_head_kv_vec[layer]; + // TODO: what should happen when it's negative? + GGML_ASSERT(n_hkv_l >= 0); + return n_hkv_l; + } + return n_head_kv; + } + + uint32_t n_ff_l(uint32_t layer) const { + if (layer < n_ff_vec.size()) { + int32_t n_f_l = n_ff_vec[layer]; + // TODO: what should happen when it's negative? + GGML_ASSERT(n_f_l >= 0); + return n_f_l; + } + return n_ff; + } + + uint32_t n_gqa(uint32_t layer = 0) const { + uint32_t n_head_kv = n_head_kv_l(layer); + uint32_t n_head = n_head_l(layer); if (n_head_kv == 0) { return 0; } return n_head/n_head_kv; } - uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads + uint32_t n_embd_k_gqa(uint32_t layer = 0) const { // dimension of key embeddings across all k-v heads + uint32_t n_head_kv = n_head_kv_l(layer); return n_embd_head_k * n_head_kv; } - uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads + uint32_t n_embd_v_gqa(uint32_t layer = 0) const { // dimension of value embeddings across all k-v heads + uint32_t n_head_kv = n_head_kv_l(layer); return n_embd_head_v * n_head_kv; } @@ -2807,8 +2852,6 @@ static bool llama_kv_cache_init( const struct llama_hparams & hparams = model.hparams; - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s(); const int64_t n_layer = hparams.n_layer; cache.has_shift = false; @@ -2817,13 +2860,6 @@ static bool llama_kv_cache_init( cache.recurrent = model.arch == LLM_ARCH_MAMBA; cache.v_trans = !cparams.flash_attn; - // TODO: support mixed recurrent Transformer architectures - // NOTE: (!a || b) is a logical implication (a -> b) - GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s()); - GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s()); - GGML_ASSERT( cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_gqa()); - GGML_ASSERT( cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_gqa()); - cache.head = 0; cache.size = kv_size; cache.used = 0; @@ -2873,6 +2909,9 @@ static bool llama_kv_cache_init( cache.v_l.reserve(n_layer); for (int i = 0; i < (int) n_layer; i++) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); + struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); @@ -3748,9 +3787,9 @@ struct llama_model_loader { bool get_arr(const std::string & key, std::vector & result, const bool required = true) { const int kid = gguf_find_key(meta, key.c_str()); - if (kid < 0) { + if (kid < 0 || gguf_get_kv_type(meta, kid) != GGUF_TYPE_ARRAY) { if (required) { - throw std::runtime_error(format("key not found in model: %s", key.c_str())); + throw std::runtime_error(format("array key not found in model: %s", key.c_str())); } return false; } @@ -3758,13 +3797,14 @@ struct llama_model_loader { struct GGUFMeta::ArrayInfo arr_info = GGUFMeta::GKV::get_kv(meta, kid); - if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) { - throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str())); - } - + // TODO: allow ANY lossless cast // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T)); - GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same::value)); - GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same::value)); + switch (arr_info.gt) { + case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; + case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same::value)); break; + default: + throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str())); + } result.resize(arr_info.length); result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length); @@ -4378,8 +4418,6 @@ static void llm_load_hparams( ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); - ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff); - ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head); ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); @@ -4392,9 +4430,22 @@ static void llm_load_hparams( GGML_ASSERT(hparams.n_expert_used == 0); } + // per-layer or global values + if (!ml.get_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_vec, false)) { + ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff); + } + if (!ml.get_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_vec, false)) { + ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head); + } else { + hparams.n_head = hparams.n_head_vec[0]; + } + // n_head_kv is optional, default to n_head hparams.n_head_kv = hparams.n_head; - ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false); + + if (!ml.get_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_vec, false)) { + ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false); + } bool rope_finetuned = false; ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); @@ -5530,24 +5581,6 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { } } -static int make_divisible( - double v, - int divisor = 8, - double min_value = 0 -) { - if (min_value == 0) { - min_value = divisor; - } - int new_v = int(v + divisor / 2.0) / divisor * divisor; - if (new_v < min_value) { - new_v = min_value; - } - if (new_v < 0.9 * v) { - new_v += divisor; - } - return new_v; -} - // Returns false if cancelled by progress_callback static bool llm_load_tensors( llama_model_loader & ml, @@ -6830,44 +6863,37 @@ static bool llm_load_tensors( } break; case LLM_ARCH_OPENELM: { - std::vector num_kv_heads = {3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5}; - std::vector num_query_heads = {12, 12, 12, 12, 12, 16, 16, 16, 16, 16, 16, 16, 20, 20, 20, 20}; - std::vector ffn_multipliers = {0.5, 0.73, 0.97, 1.2, 1.43, 1.67, 1.9, 2.13, 2.37, 2.6, 2.83, 3.07, 3.3, 3.53, 3.77, 4.0}; - - llama_hparams modified_hparams(hparams); - model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + // output { - model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }); - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + // init output from the input tok embed + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } for (int i = 0; i < n_layer; ++i) { - const int64_t n_head_k = num_kv_heads[i]; - const int64_t n_head_v = num_kv_heads[i]; - const int64_t n_head_kv = n_head_k + n_head_v; - const int64_t n_head = n_head_kv + num_query_heads[i]; - modified_hparams.n_head = n_head; - modified_hparams.n_embd_head_v = 64; - modified_hparams.n_embd_head_k = 64; - int64_t n_embd_head = modified_hparams.n_embd_head_v; + const int64_t n_head = hparams.n_head_l(i); + const int64_t n_head_qkv = 2*hparams.n_head_kv_l(i) + n_head; + const int64_t n_embd_head = hparams.n_embd_head_k; - modified_hparams.n_head_kv = n_head_kv; - const int64_t ffn_inter = make_divisible(n_embd*ffn_multipliers[i], 256); + const int64_t n_ff = hparams.n_ff_l(i); ggml_context* ctx_layer = ctx_for_layer(i); ggml_context* ctx_split = ctx_for_layer_split(i); - auto& layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }); - layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd_head*n_head }); - layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head }); - layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head }); - layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head_kv*n_embd_head*2, n_embd }); - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }); - layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * ffn_inter }); - layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_inter, n_embd }); + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head}); + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head}); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head, n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_GPTNEOX: @@ -7325,8 +7351,8 @@ static void llm_build_kv_store( int64_t il) { const int64_t n_ctx = cparams.n_ctx; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); GGML_ASSERT(kv.size == n_ctx); @@ -7619,12 +7645,12 @@ static struct ggml_tensor * llm_build_kqv( const llm_build_cb & cb, int il) { const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_head = hparams.n_head_l(il); + const int64_t n_head_kv = hparams.n_head_kv_l(il); const int64_t n_embd_head_k = hparams.n_embd_head_k; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); const int64_t n_embd_head_v = hparams.n_embd_head_v; - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); cb(q, "q", il); @@ -7899,6 +7925,8 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { + const int64_t n_head_kv = hparams.n_head_kv_l(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); struct ggml_tensor * rope_factors = build_rope_factors(il); struct ggml_tensor * tmp = // we rotate only the first n_rot dimensions @@ -7958,6 +7986,9 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], n_embd_k_gqa, nm, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), @@ -11779,12 +11810,8 @@ struct llm_build_context { struct ggml_cgraph * build_openelm() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - const int64_t n_embd_head = 64; - // TODO: get this from config - std::vector num_kv_heads = {3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5}; - std::vector num_query_heads = {12, 12, 12, 12, 12, 16, 16, 16, 16, 16, 16, 16, 20, 20, 20, 20}; - - llama_hparams modified_hparams(hparams); + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -11795,23 +11822,17 @@ struct llm_build_context { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - for (int il = 0; il < n_layer; ++il) { - const int64_t n_head_q = num_query_heads[il]; - const int64_t n_head_k = num_kv_heads[il]; - const int64_t n_head_v = num_kv_heads[il]; - const int64_t n_head_kv = n_head_k + n_head_v; - const int64_t n_head = n_head_kv + n_head_q; - modified_hparams.n_head = n_head_q; - modified_hparams.n_head_kv = n_head_k; - modified_hparams.n_embd_head_k = 64; - modified_hparams.n_embd_head_v = 64; + for (int il = 0; il < n_layer; ++il) { + const int64_t n_head = hparams.n_head_l(il); + const int64_t n_head_kv = hparams.n_head_kv_l(il); + const int64_t n_head_qkv = 2*n_head_kv + n_head; cur = inpL; struct ggml_tensor * residual = cur; // norm - cur = llm_build_norm(ctx0, inpL, modified_hparams, + cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); cb(cur, "attn_norm", il); @@ -11821,23 +11842,23 @@ struct llm_build_context { cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - cur = ggml_reshape_3d(ctx0, cur, n_embd_head, n_head, n_tokens); + cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_q, n_tokens, cur->nb[1], cur->nb[2], 0)); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0)); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_k, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head_q)); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head)); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_v, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head_q+n_head_k))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, modified_hparams, + Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, modified_hparams, + Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il); cb(Kcur, "Kcur", il); @@ -11854,10 +11875,10 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_v, n_tokens); + Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); cb(Qcur, "Vcur", il); - cur = llm_build_kv(ctx0, model, modified_hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11865,6 +11886,7 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + residual = ggml_get_rows(ctx0, residual, inp_out_ids); cur = ggml_get_rows(ctx0, cur, inp_out_ids); } @@ -11873,31 +11895,14 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, modified_hparams, + cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - // TODO: Split ffn_up during conversion? - struct ggml_tensor * ffn_gate = - ggml_view_2d(ctx0, - model.layers[il].ffn_up, - model.layers[il].ffn_down->ne[1], - model.layers[il].ffn_down->ne[0], - model.layers[il].ffn_up->nb[1], - 0); - - struct ggml_tensor * ffn_up = - ggml_view_2d(ctx0, - model.layers[il].ffn_up, - model.layers[il].ffn_down->ne[1], - model.layers[il].ffn_down->ne[0], - model.layers[il].ffn_up->nb[1], - model.layers[il].ffn_up->nb[1] * model.layers[il].ffn_down->ne[0]); - cur = llm_build_ffn(ctx0, cur, - ffn_up, NULL, NULL, - ffn_gate, NULL, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il);