2023-11-11 06:04:50 +01:00
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
from enum import Enum, IntEnum, auto
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
#
|
|
|
|
# constants
|
|
|
|
#
|
|
|
|
|
|
|
|
# File-format constants.
#
# GGUF_MAGIC is the ASCII string "GGUF" interpreted as a little-endian 32-bit
# integer: bytes 0x47 0x47 0x55 0x46 -> 0x46554747.
GGUF_MAGIC = 0x46554747  # "GGUF"
GGUF_VERSION = 3
GGUF_DEFAULT_ALIGNMENT = 32  # NOTE(review): presumably a byte count - confirm against the writer/reader
GGML_QUANT_VERSION = 2  # GGML_QNT_VERSION from ggml.h
|
2023-11-11 06:04:50 +01:00
|
|
|
|
|
|
|
#
|
|
|
|
# metadata keys
|
|
|
|
#
|
|
|
|
|
|
|
|
|
|
|
|
class Keys:
|
|
|
|
class General:
|
2024-07-18 12:40:15 +02:00
|
|
|
TYPE = "general.type"
|
|
|
|
ARCHITECTURE = "general.architecture"
|
|
|
|
QUANTIZATION_VERSION = "general.quantization_version"
|
|
|
|
ALIGNMENT = "general.alignment"
|
|
|
|
FILE_TYPE = "general.file_type"
|
|
|
|
|
|
|
|
# Authorship Metadata
|
|
|
|
NAME = "general.name"
|
|
|
|
AUTHOR = "general.author"
|
|
|
|
VERSION = "general.version"
|
|
|
|
ORGANIZATION = "general.organization"
|
|
|
|
|
|
|
|
FINETUNE = "general.finetune"
|
|
|
|
BASENAME = "general.basename"
|
|
|
|
|
|
|
|
DESCRIPTION = "general.description"
|
|
|
|
QUANTIZED_BY = "general.quantized_by"
|
|
|
|
|
|
|
|
SIZE_LABEL = "general.size_label"
|
|
|
|
|
|
|
|
# Licensing details
|
|
|
|
LICENSE = "general.license"
|
|
|
|
LICENSE_NAME = "general.license.name"
|
|
|
|
LICENSE_LINK = "general.license.link"
|
|
|
|
|
|
|
|
# Typically represents the converted GGUF repo (Unless native)
|
|
|
|
URL = "general.url" # Model Website/Paper
|
|
|
|
DOI = "general.doi"
|
|
|
|
UUID = "general.uuid"
|
|
|
|
REPO_URL = "general.repo_url" # Model Source Repository (git/svn/etc...)
|
|
|
|
|
|
|
|
# Model Source during conversion
|
|
|
|
SOURCE_URL = "general.source.url" # Model Website/Paper
|
|
|
|
SOURCE_DOI = "general.source.doi"
|
|
|
|
SOURCE_UUID = "general.source.uuid"
|
|
|
|
SOURCE_REPO_URL = "general.source.repo_url" # Model Source Repository (git/svn/etc...)
|
|
|
|
|
|
|
|
# Base Model Source. There can be more than one source if it's a merged
|
|
|
|
# model like with 'Mistral-7B-Merge-14-v0.1'. This will assist in
|
|
|
|
# tracing linage of models as it is finetuned or merged over time.
|
|
|
|
BASE_MODEL_COUNT = "general.base_model.count"
|
|
|
|
BASE_MODEL_NAME = "general.base_model.{id}.name"
|
|
|
|
BASE_MODEL_AUTHOR = "general.base_model.{id}.author"
|
|
|
|
BASE_MODEL_VERSION = "general.base_model.{id}.version"
|
|
|
|
BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization"
|
|
|
|
BASE_MODEL_URL = "general.base_model.{id}.url" # Model Website/Paper
|
|
|
|
BASE_MODEL_DOI = "general.base_model.{id}.doi"
|
|
|
|
BASE_MODEL_UUID = "general.base_model.{id}.uuid"
|
|
|
|
BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" # Model Source Repository (git/svn/etc...)
|
|
|
|
|
|
|
|
# Array based KV stores
|
|
|
|
TAGS = "general.tags"
|
|
|
|
LANGUAGES = "general.languages"
|
|
|
|
DATASETS = "general.datasets"
|
2023-11-11 06:04:50 +01:00
|
|
|
|
|
|
|
class LLM:
|
2024-06-17 21:08:46 +02:00
|
|
|
VOCAB_SIZE = "{arch}.vocab_size"
|
|
|
|
CONTEXT_LENGTH = "{arch}.context_length"
|
|
|
|
EMBEDDING_LENGTH = "{arch}.embedding_length"
|
|
|
|
BLOCK_COUNT = "{arch}.block_count"
|
|
|
|
LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count"
|
|
|
|
FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
|
|
|
|
EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length"
|
|
|
|
EXPERT_SHARED_FEED_FORWARD_LENGTH = "{arch}.expert_shared_feed_forward_length"
|
|
|
|
USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
|
|
|
|
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
|
|
|
EXPERT_COUNT = "{arch}.expert_count"
|
|
|
|
EXPERT_USED_COUNT = "{arch}.expert_used_count"
|
|
|
|
EXPERT_SHARED_COUNT = "{arch}.expert_shared_count"
|
|
|
|
EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
|
|
|
|
POOLING_TYPE = "{arch}.pooling_type"
|
|
|
|
LOGIT_SCALE = "{arch}.logit_scale"
|
2024-06-24 07:06:05 +02:00
|
|
|
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
2024-06-30 05:44:08 +02:00
|
|
|
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
|
|
|
|
FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
|
2024-09-01 16:38:17 +02:00
|
|
|
RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers"
|
|
|
|
TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim"
|
|
|
|
TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
|
2023-11-11 06:04:50 +01:00
|
|
|
|
|
|
|
class Attention:
|
|
|
|
HEAD_COUNT = "{arch}.attention.head_count"
|
|
|
|
HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
|
|
|
|
MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
|
|
|
|
CLAMP_KQV = "{arch}.attention.clamp_kqv"
|
2024-01-02 12:51:28 +01:00
|
|
|
KEY_LENGTH = "{arch}.attention.key_length"
|
|
|
|
VALUE_LENGTH = "{arch}.attention.value_length"
|
2023-11-11 06:04:50 +01:00
|
|
|
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
|
|
|
|
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
|
2024-02-11 17:21:38 +01:00
|
|
|
CAUSAL = "{arch}.attention.causal"
|
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
|
|
|
Q_LORA_RANK = "{arch}.attention.q_lora_rank"
|
|
|
|
KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
|
2024-06-24 07:06:05 +02:00
|
|
|
REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
|
2024-07-01 18:48:34 +02:00
|
|
|
SLIDING_WINDOW = "{arch}.attention.sliding_window"
|
2023-11-11 06:04:50 +01:00
|
|
|
|
|
|
|
class Rope:
|
2024-05-21 22:28:32 +02:00
|
|
|
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
|
|
|
FREQ_BASE = "{arch}.rope.freq_base"
|
|
|
|
SCALING_TYPE = "{arch}.rope.scaling.type"
|
|
|
|
SCALING_FACTOR = "{arch}.rope.scaling.factor"
|
|
|
|
SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
|
|
|
|
SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
|
|
|
|
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
|
|
|
SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"
|
2023-11-11 06:04:50 +01:00
|
|
|
|
2024-06-24 11:42:03 +02:00
|
|
|
class Split:
|
|
|
|
LLM_KV_SPLIT_NO = "split.no"
|
|
|
|
LLM_KV_SPLIT_COUNT = "split.count"
|
|
|
|
LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"
|
|
|
|
|
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of offical Mamba models
I was using Q-bert/Mamba-* models before, which have a slighlty different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay to high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 23:31:00 +01:00
|
|
|
class SSM:
|
|
|
|
CONV_KERNEL = "{arch}.ssm.conv_kernel"
|
|
|
|
INNER_SIZE = "{arch}.ssm.inner_size"
|
|
|
|
STATE_SIZE = "{arch}.ssm.state_size"
|
|
|
|
TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
|
2024-08-21 10:06:36 +02:00
|
|
|
DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"
|
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of offical Mamba models
I was using Q-bert/Mamba-* models before, which have a slighlty different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay to high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 23:31:00 +01:00
|
|
|
|
2024-09-01 16:38:17 +02:00
|
|
|
class WKV:
    """Metadata key template(s) under the ``{arch}.wkv.`` prefix.

    ``{arch}`` is a literal placeholder inside the string; presumably callers
    substitute the architecture name via ``str.format`` -- confirm at call sites.
    """

    HEAD_SIZE = "{arch}.wkv.head_size"
|
|
|
|
|
2023-11-11 06:04:50 +01:00
|
|
|
class Tokenizer:
    """Metadata key strings under the ``tokenizer.`` prefix.

    Every attribute is a plain string constant. ``CHAT_TEMPLATE_N`` carries a
    literal ``{name}`` placeholder (presumably filled with ``str.format`` by
    callers -- confirm at call sites).
    """

    MODEL = "tokenizer.ggml.model"
    PRE = "tokenizer.ggml.pre"
    LIST = "tokenizer.ggml.tokens"
    TOKEN_TYPE = "tokenizer.ggml.token_type"
    TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count"  # for BERT-style token types
    SCORES = "tokenizer.ggml.scores"
    MERGES = "tokenizer.ggml.merges"
    BOS_ID = "tokenizer.ggml.bos_token_id"
    EOS_ID = "tokenizer.ggml.eos_token_id"
    UNK_ID = "tokenizer.ggml.unknown_token_id"
    # NOTE: the "seperator" spelling is part of the key string itself; correcting
    # it here would change the key that is read and written at runtime.
    SEP_ID = "tokenizer.ggml.seperator_token_id"
    PAD_ID = "tokenizer.ggml.padding_token_id"
    CLS_ID = "tokenizer.ggml.cls_token_id"
    MASK_ID = "tokenizer.ggml.mask_token_id"
    ADD_BOS = "tokenizer.ggml.add_bos_token"
    ADD_EOS = "tokenizer.ggml.add_eos_token"
    ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
    REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
    PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
    HF_JSON = "tokenizer.huggingface.json"
    RWKV = "tokenizer.rwkv.world"
    CHAT_TEMPLATE = "tokenizer.chat_template"
    CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
    CHAT_TEMPLATES = "tokenizer.chat_templates"
    # FIM/Infill special tokens constants
    PREFIX_ID = "tokenizer.ggml.prefix_token_id"
    SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
    MIDDLE_ID = "tokenizer.ggml.middle_token_id"
    EOT_ID = "tokenizer.ggml.eot_token_id"
    EOM_ID = "tokenizer.ggml.eom_token_id"
|
2023-11-11 06:04:50 +01:00
|
|
|
|
2024-07-15 20:50:47 +02:00
|
|
|
class Adapter:
    """Metadata key strings under the ``adapter.`` prefix."""

    TYPE = "adapter.type"
    LORA_ALPHA = "adapter.lora.alpha"
|
|
|
|
|
2023-11-11 06:04:50 +01:00
|
|
|
#
|
|
|
|
# recommended mapping of model tensor names for storage in gguf
|
|
|
|
#
|
|
|
|
|
|
|
|
|
2024-07-15 20:50:47 +02:00
|
|
|
class GGUFType:
    """String constants naming the kinds of GGUF file: a model or an adapter.

    Presumably these are the values stored under the ``general.type`` key --
    confirm against the writer code.
    """

    MODEL = "model"
    ADAPTER = "adapter"
|
|
|
|
|
|
|
|
|
2023-11-11 06:04:50 +01:00
|
|
|
class MODEL_ARCH(IntEnum):
    """Enumeration of the model architectures known to this module.

    Values are assigned by ``auto()``, i.e. sequentially from 1 in declaration
    order, so inserting or reordering members changes the numeric value of
    every member that follows.
    """

    LLAMA = auto()
    FALCON = auto()
    BAICHUAN = auto()
    GROK = auto()
    GPT2 = auto()
    GPTJ = auto()
    GPTNEOX = auto()
    MPT = auto()
    STARCODER = auto()
    REFACT = auto()
    BERT = auto()
    NOMIC_BERT = auto()
    JINA_BERT_V2 = auto()
    BLOOM = auto()
    STABLELM = auto()
    QWEN = auto()
    QWEN2 = auto()
    QWEN2MOE = auto()
    PHI2 = auto()
    PHI3 = auto()
    PLAMO = auto()
    CODESHELL = auto()
    ORION = auto()
    INTERNLM2 = auto()
    MINICPM = auto()
    GEMMA = auto()
    GEMMA2 = auto()
    STARCODER2 = auto()
    RWKV6 = auto()
    MAMBA = auto()
    XVERSE = auto()
    COMMAND_R = auto()
    DBRX = auto()
    OLMO = auto()
    OPENELM = auto()
    ARCTIC = auto()
    DEEPSEEK2 = auto()
    CHATGLM = auto()
    BITNET = auto()
    T5 = auto()
    T5ENCODER = auto()
    JAIS = auto()
    NEMOTRON = auto()
    EXAONE = auto()
|
2023-11-11 06:04:50 +01:00
|
|
|
|
|
|
|
|
|
|
|
class MODEL_TENSOR(IntEnum):
    """Enumeration of the tensor roles known to this module.

    Values are assigned by ``auto()``, i.e. sequentially from 1 in declaration
    order, so inserting or reordering members changes the numeric value of
    every member that follows.
    """

    TOKEN_EMBD = auto()
    TOKEN_EMBD_NORM = auto()
    TOKEN_TYPES = auto()
    POS_EMBD = auto()
    OUTPUT = auto()
    OUTPUT_NORM = auto()
    ROPE_FREQS = auto()
    ROPE_FACTORS_LONG = auto()
    ROPE_FACTORS_SHORT = auto()
    ATTN_Q = auto()
    ATTN_K = auto()
    ATTN_V = auto()
    ATTN_QKV = auto()
    ATTN_OUT = auto()
    ATTN_NORM = auto()
    ATTN_NORM_2 = auto()
    ATTN_OUT_NORM = auto()
    ATTN_POST_NORM = auto()
    ATTN_ROT_EMBD = auto()
    FFN_GATE_INP = auto()
    FFN_GATE_INP_SHEXP = auto()
    FFN_NORM = auto()
    FFN_PRE_NORM = auto()
    FFN_POST_NORM = auto()
    FFN_GATE = auto()
    FFN_DOWN = auto()
    FFN_UP = auto()
    FFN_ACT = auto()
    FFN_NORM_EXP = auto()
    FFN_GATE_EXP = auto()
    FFN_DOWN_EXP = auto()
    FFN_UP_EXP = auto()
    FFN_GATE_SHEXP = auto()
    FFN_DOWN_SHEXP = auto()
    FFN_UP_SHEXP = auto()
    ATTN_Q_NORM = auto()
    ATTN_K_NORM = auto()
    LAYER_OUT_NORM = auto()
    SSM_IN = auto()
    SSM_CONV1D = auto()
    SSM_X = auto()
    SSM_DT = auto()
    SSM_A = auto()
    SSM_D = auto()
    SSM_OUT = auto()
    TIME_MIX_W1 = auto()
    TIME_MIX_W2 = auto()
    TIME_MIX_LERP_X = auto()
    TIME_MIX_LERP_K = auto()
    TIME_MIX_LERP_V = auto()
    TIME_MIX_LERP_R = auto()
    TIME_MIX_LERP_G = auto()
    TIME_MIX_LERP_W = auto()
    TIME_MIX_FIRST = auto()
    TIME_MIX_DECAY = auto()
    TIME_MIX_DECAY_W1 = auto()
    TIME_MIX_DECAY_W2 = auto()
    TIME_MIX_KEY = auto()
    TIME_MIX_VALUE = auto()
    TIME_MIX_RECEPTANCE = auto()
    TIME_MIX_GATE = auto()
    TIME_MIX_LN = auto()
    TIME_MIX_OUTPUT = auto()
    CHANNEL_MIX_LERP_K = auto()
    CHANNEL_MIX_LERP_R = auto()
    CHANNEL_MIX_KEY = auto()
    CHANNEL_MIX_RECEPTANCE = auto()
    CHANNEL_MIX_VALUE = auto()
    ATTN_Q_A = auto()
    ATTN_Q_B = auto()
    ATTN_KV_A_MQA = auto()
    ATTN_KV_B = auto()
    ATTN_Q_A_NORM = auto()
    ATTN_KV_A_NORM = auto()
    FFN_SUB_NORM = auto()
    ATTN_SUB_NORM = auto()
    DEC_ATTN_NORM = auto()
    DEC_ATTN_Q = auto()
    DEC_ATTN_K = auto()
    DEC_ATTN_V = auto()
    DEC_ATTN_OUT = auto()
    DEC_ATTN_REL_B = auto()
    DEC_CROSS_ATTN_NORM = auto()
    DEC_CROSS_ATTN_Q = auto()
    DEC_CROSS_ATTN_K = auto()
    DEC_CROSS_ATTN_V = auto()
    DEC_CROSS_ATTN_OUT = auto()
    DEC_CROSS_ATTN_REL_B = auto()
    DEC_FFN_NORM = auto()
    DEC_FFN_GATE = auto()
    DEC_FFN_DOWN = auto()
    DEC_FFN_UP = auto()
    DEC_OUTPUT_NORM = auto()
    ENC_ATTN_NORM = auto()
    ENC_ATTN_Q = auto()
    ENC_ATTN_K = auto()
    ENC_ATTN_V = auto()
    ENC_ATTN_OUT = auto()
    ENC_ATTN_REL_B = auto()
    ENC_FFN_NORM = auto()
    ENC_FFN_GATE = auto()
    ENC_FFN_DOWN = auto()
    ENC_FFN_UP = auto()
    ENC_OUTPUT_NORM = auto()
|
2023-11-11 06:04:50 +01:00
|
|
|
|
|
|
|
|
|
|
|
# Maps each MODEL_ARCH member to its architecture name string.
# Presumably this is the value stored under the "general.architecture" key and
# substituted for "{arch}" in per-architecture key templates -- confirm at call sites.
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.LLAMA: "llama",
    MODEL_ARCH.FALCON: "falcon",
    MODEL_ARCH.BAICHUAN: "baichuan",
    MODEL_ARCH.GROK: "grok",
    MODEL_ARCH.GPT2: "gpt2",
    MODEL_ARCH.GPTJ: "gptj",
    MODEL_ARCH.GPTNEOX: "gptneox",
    MODEL_ARCH.MPT: "mpt",
    MODEL_ARCH.STARCODER: "starcoder",
    MODEL_ARCH.REFACT: "refact",
    MODEL_ARCH.BERT: "bert",
    MODEL_ARCH.NOMIC_BERT: "nomic-bert",
    MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
    MODEL_ARCH.BLOOM: "bloom",
    MODEL_ARCH.STABLELM: "stablelm",
    MODEL_ARCH.QWEN: "qwen",
    MODEL_ARCH.QWEN2: "qwen2",
    MODEL_ARCH.QWEN2MOE: "qwen2moe",
    MODEL_ARCH.PHI2: "phi2",
    MODEL_ARCH.PHI3: "phi3",
    MODEL_ARCH.PLAMO: "plamo",
    MODEL_ARCH.CODESHELL: "codeshell",
    MODEL_ARCH.ORION: "orion",
    MODEL_ARCH.INTERNLM2: "internlm2",
    MODEL_ARCH.MINICPM: "minicpm",
    MODEL_ARCH.GEMMA: "gemma",
    MODEL_ARCH.GEMMA2: "gemma2",
    MODEL_ARCH.STARCODER2: "starcoder2",
    MODEL_ARCH.RWKV6: "rwkv6",
    MODEL_ARCH.MAMBA: "mamba",
    MODEL_ARCH.XVERSE: "xverse",
    MODEL_ARCH.COMMAND_R: "command-r",
    MODEL_ARCH.DBRX: "dbrx",
    MODEL_ARCH.OLMO: "olmo",
    MODEL_ARCH.OPENELM: "openelm",
    MODEL_ARCH.ARCTIC: "arctic",
    MODEL_ARCH.DEEPSEEK2: "deepseek2",
    MODEL_ARCH.CHATGLM: "chatglm",
    MODEL_ARCH.BITNET: "bitnet",
    MODEL_ARCH.T5: "t5",
    MODEL_ARCH.T5ENCODER: "t5encoder",
    MODEL_ARCH.JAIS: "jais",
    MODEL_ARCH.NEMOTRON: "nemotron",
    MODEL_ARCH.EXAONE: "exaone",
}
|
|
|
|
|
|
|
|
# Maps each MODEL_TENSOR member to its tensor name string.
# Names containing "{bid}" are templates with a literal placeholder
# (presumably the block index, filled in by callers -- confirm at call sites).
# Names without it are used as-is.
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.TOKEN_EMBD: "token_embd",
    MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
    MODEL_TENSOR.TOKEN_TYPES: "token_types",
    MODEL_TENSOR.POS_EMBD: "position_embd",
    MODEL_TENSOR.OUTPUT_NORM: "output_norm",
    MODEL_TENSOR.OUTPUT: "output",
    MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
    MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
    MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
    MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
    MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
    MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
    MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
    MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
    MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
    MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
    MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
    MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
    MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
    MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
    MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
    MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
    MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
    MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
    # NOTE(review): FFN_PRE_NORM maps to the same string as FFN_NORM; this
    # looks intentional, but confirm before changing either entry.
    MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
    MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
    MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
    MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
    MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
    MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
    MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
    MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
    MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
    MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
    MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
    MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
    MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
    MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
    MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
    MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
    MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
    MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
    MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
    MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
    MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
    MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
    MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2",
    MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x",
    MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k",
    MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v",
    MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r",
    MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g",
    MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w",
    MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first",
    MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay",
    MODEL_TENSOR.TIME_MIX_DECAY_W1: "blk.{bid}.time_mix_decay_w1",
    MODEL_TENSOR.TIME_MIX_DECAY_W2: "blk.{bid}.time_mix_decay_w2",
    MODEL_TENSOR.TIME_MIX_KEY: "blk.{bid}.time_mix_key",
    MODEL_TENSOR.TIME_MIX_VALUE: "blk.{bid}.time_mix_value",
    MODEL_TENSOR.TIME_MIX_RECEPTANCE: "blk.{bid}.time_mix_receptance",
    MODEL_TENSOR.TIME_MIX_GATE: "blk.{bid}.time_mix_gate",
    MODEL_TENSOR.TIME_MIX_LN: "blk.{bid}.time_mix_ln",
    MODEL_TENSOR.TIME_MIX_OUTPUT: "blk.{bid}.time_mix_output",
    MODEL_TENSOR.CHANNEL_MIX_LERP_K: "blk.{bid}.channel_mix_lerp_k",
    MODEL_TENSOR.CHANNEL_MIX_LERP_R: "blk.{bid}.channel_mix_lerp_r",
    MODEL_TENSOR.CHANNEL_MIX_KEY: "blk.{bid}.channel_mix_key",
    MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: "blk.{bid}.channel_mix_receptance",
    MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value",
    MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
    MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
    MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
    MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
    MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
    MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
    MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
    MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
    MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
    MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
    MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
    MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
    MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
    MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
    MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
    MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
    MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
    MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
    MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
    MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
    MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
    MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
    MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
    MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
    MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
    MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
    MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
    MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
    MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
    MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
    MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
    MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
    MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
    MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
    MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
    MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
}
|
|
|
|
|
|
|
|
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
|
|
MODEL_ARCH.LLAMA: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
2023-12-13 13:04:25 +01:00
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
2023-11-11 06:04:50 +01:00
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
2023-12-13 13:04:25 +01:00
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
2023-11-11 06:04:50 +01:00
|
|
|
],
|
2024-03-23 17:41:53 +01:00
|
|
|
MODEL_ARCH.GROK: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
|
|
|
],
|
2023-11-11 06:04:50 +01:00
|
|
|
MODEL_ARCH.GPTNEOX: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.FALCON: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM_2,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.BAICHUAN: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.STARCODER: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.POS_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.BERT: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
2024-02-11 17:21:38 +01:00
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
2023-11-11 06:04:50 +01:00
|
|
|
MODEL_TENSOR.TOKEN_TYPES,
|
|
|
|
MODEL_TENSOR.POS_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
2024-02-11 17:21:38 +01:00
|
|
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
2023-11-11 06:04:50 +01:00
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
2024-02-11 17:21:38 +01:00
|
|
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
2023-11-11 06:04:50 +01:00
|
|
|
],
|
2024-02-13 18:03:53 +01:00
|
|
|
MODEL_ARCH.NOMIC_BERT: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
|
|
|
MODEL_TENSOR.TOKEN_TYPES,
|
|
|
|
MODEL_TENSOR.POS_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
|
|
|
],
|
2024-05-11 09:46:09 +02:00
|
|
|
MODEL_ARCH.JINA_BERT_V2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
|
|
|
MODEL_TENSOR.TOKEN_TYPES,
|
2024-06-06 09:22:41 +02:00
|
|
|
MODEL_TENSOR.ATTN_NORM_2,
|
2024-05-11 09:46:09 +02:00
|
|
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
|
|
|
],
|
2023-11-11 06:04:50 +01:00
|
|
|
MODEL_ARCH.MPT: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
2023-12-27 16:39:45 +01:00
|
|
|
MODEL_TENSOR.FFN_ACT,
|
2024-04-03 20:05:10 +02:00
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.POS_EMBD,
|
2023-11-11 06:04:50 +01:00
|
|
|
],
|
|
|
|
MODEL_ARCH.GPTJ: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.REFACT: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.BLOOM: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
2023-11-14 11:17:12 +01:00
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.STABLELM: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
2023-11-11 06:04:50 +01:00
|
|
|
MODEL_TENSOR.FFN_UP,
|
2024-04-16 17:48:35 +02:00
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
2023-11-11 06:04:50 +01:00
|
|
|
],
|
2023-12-01 19:16:31 +01:00
|
|
|
MODEL_ARCH.QWEN: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-01-19 12:53:13 +01:00
|
|
|
MODEL_ARCH.QWEN2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-04-16 17:40:48 +02:00
|
|
|
MODEL_ARCH.QWEN2MOE: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
|
|
|
],
|
2023-12-24 14:35:49 +01:00
|
|
|
MODEL_ARCH.PLAMO: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2023-11-11 06:04:50 +01:00
|
|
|
MODEL_ARCH.GPT2: [
|
2023-12-28 15:03:57 +01:00
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.POS_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
2023-11-11 06:04:50 +01:00
|
|
|
],
|
2023-12-18 18:27:47 +01:00
|
|
|
MODEL_ARCH.PHI2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
2024-04-24 09:00:37 +02:00
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.PHI3: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
2023-12-18 18:27:47 +01:00
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
2024-01-13 12:44:37 +01:00
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
2023-12-18 18:27:47 +01:00
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
2024-01-19 10:07:27 +01:00
|
|
|
],
|
|
|
|
MODEL_ARCH.CODESHELL: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.POS_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
2024-01-28 09:00:30 +01:00
|
|
|
],
|
|
|
|
MODEL_ARCH.ORION: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-02-01 10:19:51 +01:00
|
|
|
MODEL_ARCH.INTERNLM2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-02-07 07:15:56 +01:00
|
|
|
MODEL_ARCH.MINICPM: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
2024-06-03 09:49:30 +02:00
|
|
|
MODEL_TENSOR.OUTPUT,
|
2024-02-07 07:15:56 +01:00
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
],
|
2024-02-21 14:08:22 +01:00
|
|
|
MODEL_ARCH.GEMMA: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
],
|
2024-06-28 06:00:43 +02:00
|
|
|
MODEL_ARCH.GEMMA2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_POST_NORM,
|
|
|
|
MODEL_TENSOR.FFN_PRE_NORM,
|
|
|
|
MODEL_TENSOR.FFN_POST_NORM,
|
|
|
|
],
|
2024-03-01 20:30:46 +01:00
|
|
|
MODEL_ARCH.STARCODER2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-09-01 16:38:17 +02:00
|
|
|
MODEL_ARCH.RWKV6: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM_2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_W1,
|
|
|
|
MODEL_TENSOR.TIME_MIX_W2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_X,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_K,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_V,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_R,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_G,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_W,
|
|
|
|
MODEL_TENSOR.TIME_MIX_FIRST,
|
|
|
|
MODEL_TENSOR.TIME_MIX_DECAY,
|
|
|
|
MODEL_TENSOR.TIME_MIX_DECAY_W1,
|
|
|
|
MODEL_TENSOR.TIME_MIX_DECAY_W2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_KEY,
|
|
|
|
MODEL_TENSOR.TIME_MIX_VALUE,
|
|
|
|
MODEL_TENSOR.TIME_MIX_RECEPTANCE,
|
|
|
|
MODEL_TENSOR.TIME_MIX_GATE,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LN,
|
|
|
|
MODEL_TENSOR.TIME_MIX_OUTPUT,
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_LERP_K,
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_LERP_R,
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_KEY,
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE,
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_VALUE,
|
|
|
|
],
|
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 23:31:00 +01:00
|
|
|
MODEL_ARCH.MAMBA: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.SSM_IN,
|
|
|
|
MODEL_TENSOR.SSM_CONV1D,
|
|
|
|
MODEL_TENSOR.SSM_X,
|
|
|
|
MODEL_TENSOR.SSM_DT,
|
|
|
|
MODEL_TENSOR.SSM_A,
|
|
|
|
MODEL_TENSOR.SSM_D,
|
|
|
|
MODEL_TENSOR.SSM_OUT,
|
|
|
|
],
|
2024-03-29 14:37:03 +01:00
|
|
|
MODEL_ARCH.XVERSE: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-03-15 21:41:22 +01:00
|
|
|
MODEL_ARCH.COMMAND_R: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
2024-04-09 10:16:13 +02:00
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
2024-03-15 21:41:22 +01:00
|
|
|
],
|
2024-04-13 11:33:52 +02:00
|
|
|
MODEL_ARCH.DBRX: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
],
|
2024-04-19 11:35:54 +02:00
|
|
|
MODEL_ARCH.OLMO: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-07-04 19:14:21 +02:00
|
|
|
MODEL_ARCH.OPENELM: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-05-24 14:31:13 +02:00
|
|
|
MODEL_ARCH.ARCTIC: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_NORM_EXP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
],
|
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
|
|
|
MODEL_ARCH.DEEPSEEK2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_Q_A,
|
|
|
|
MODEL_TENSOR.ATTN_Q_B,
|
|
|
|
MODEL_TENSOR.ATTN_KV_A_MQA,
|
|
|
|
MODEL_TENSOR.ATTN_KV_B,
|
|
|
|
MODEL_TENSOR.ATTN_Q_A_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_KV_A_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
|
|
|
],
|
2024-07-07 14:52:10 +02:00
|
|
|
MODEL_ARCH.CHATGLM : [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-06-23 20:27:57 +02:00
|
|
|
MODEL_ARCH.BITNET: [
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.ATTN_SUB_NORM,
|
|
|
|
MODEL_TENSOR.FFN_SUB_NORM,
|
|
|
|
],
|
2024-06-24 07:06:05 +02:00
|
|
|
MODEL_ARCH.T5: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.DEC_ATTN_NORM,
|
|
|
|
MODEL_TENSOR.DEC_ATTN_Q,
|
|
|
|
MODEL_TENSOR.DEC_ATTN_K,
|
|
|
|
MODEL_TENSOR.DEC_ATTN_V,
|
|
|
|
MODEL_TENSOR.DEC_ATTN_OUT,
|
|
|
|
MODEL_TENSOR.DEC_ATTN_REL_B,
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_NORM,
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_Q,
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_K,
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_V,
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_OUT,
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B,
|
|
|
|
MODEL_TENSOR.DEC_FFN_NORM,
|
|
|
|
MODEL_TENSOR.DEC_FFN_GATE,
|
|
|
|
MODEL_TENSOR.DEC_FFN_DOWN,
|
|
|
|
MODEL_TENSOR.DEC_FFN_UP,
|
|
|
|
MODEL_TENSOR.DEC_OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_Q,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_K,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_V,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_REL_B,
|
|
|
|
MODEL_TENSOR.ENC_FFN_NORM,
|
|
|
|
MODEL_TENSOR.ENC_FFN_GATE,
|
|
|
|
MODEL_TENSOR.ENC_FFN_DOWN,
|
2024-08-10 11:43:26 +02:00
|
|
|
MODEL_TENSOR.ENC_FFN_UP,
|
|
|
|
MODEL_TENSOR.ENC_OUTPUT_NORM,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.T5ENCODER: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_Q,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_K,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_V,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_REL_B,
|
|
|
|
MODEL_TENSOR.ENC_FFN_NORM,
|
|
|
|
MODEL_TENSOR.ENC_FFN_GATE,
|
|
|
|
MODEL_TENSOR.ENC_FFN_DOWN,
|
2024-06-24 07:06:05 +02:00
|
|
|
MODEL_TENSOR.ENC_FFN_UP,
|
|
|
|
MODEL_TENSOR.ENC_OUTPUT_NORM,
|
|
|
|
],
|
2024-07-02 16:36:00 +02:00
|
|
|
MODEL_ARCH.JAIS: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-08-16 04:23:33 +02:00
|
|
|
MODEL_ARCH.NEMOTRON: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-08-16 08:35:18 +02:00
|
|
|
MODEL_ARCH.EXAONE: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2023-11-11 06:04:50 +01:00
|
|
|
# TODO
|
|
|
|
}
|
|
|
|
|
|
|
|
# tensors that will not be serialized
# Maps an architecture to the tensors listed for it that are skipped on write.
# Architectures without an entry here have no skipped tensors in this table.
MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
    MODEL_ARCH.LLAMA: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.BAICHUAN: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.QWEN: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.CODESHELL: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.ORION: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.STARCODER2: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.XVERSE: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.DEEPSEEK2: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    # CHATGLM only lists ROPE_FREQS (no ATTN_ROT_EMBD entry).
    MODEL_ARCH.CHATGLM: [
        MODEL_TENSOR.ROPE_FREQS,
    ],
    MODEL_ARCH.NEMOTRON: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# types
|
|
|
|
#
|
|
|
|
|
|
|
|
|
|
|
|
class TokenType(IntEnum):
    """Integer codes identifying the kind of a vocabulary token.

    Numbering starts at 1; no member has the value 0.
    """

    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6
|
|
|
|
|
|
|
|
|
|
|
|
class RopeScalingType(Enum):
    """RoPE scaling modes, each carrying its lowercase string name as value."""

    NONE = 'none'
    LINEAR = 'linear'
    YARN = 'yarn'
|
|
|
|
|
|
|
|
|
2024-02-15 18:21:49 +01:00
|
|
|
class PoolingType(IntEnum):
    """Integer codes for the pooling mode (none, mean, or CLS)."""

    NONE = 0
    MEAN = 1
    CLS = 2
|
|
|
|
|
|
|
|
|
2023-11-11 06:04:50 +01:00
|
|
|
class GGMLQuantizationType(IntEnum):
    """Integer type codes for GGML tensor data types.

    Values 4 and 5 are intentionally absent from this enumeration
    (presumably reserved for removed types -- TODO confirm against ggml.h).
    """

    F32 = 0
    F16 = 1
    Q4_0 = 2
    Q4_1 = 3
    Q5_0 = 6
    Q5_1 = 7
    Q8_0 = 8
    Q8_1 = 9
    Q2_K = 10
    Q3_K = 11
    Q4_K = 12
    Q5_K = 13
    Q6_K = 14
    Q8_K = 15
    IQ2_XXS = 16
    IQ2_XS = 17
    IQ3_XXS = 18
    IQ1_S = 19
    IQ4_NL = 20
    IQ3_S = 21
    IQ2_S = 22
    IQ4_XS = 23
    I8 = 24
    I16 = 25
    I32 = 26
    I64 = 27
    F64 = 28
    IQ1_M = 29
    BF16 = 30
    Q4_0_4_4 = 31
    Q4_0_4_8 = 32
    Q4_0_8_8 = 33
    TQ1_0 = 34
    TQ2_0 = 35
|
2023-11-11 06:04:50 +01:00
|
|
|
|
|
|
|
|
2024-05-11 17:06:26 +02:00
|
|
|
# TODO: add GGMLFileType from ggml_ftype in ggml.h
|
|
|
|
|
|
|
|
|
|
|
|
# from llama_ftype in llama.h
# ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE.
class LlamaFileType(IntEnum):
    """Integer codes describing the predominant tensor type of a model file.

    Values 4, 5 and 6 are not defined; they are kept below as comments so the
    numbering gap stays explained.
    """

    ALL_F32 = 0
    MOSTLY_F16 = 1  # except 1d tensors
    MOSTLY_Q4_0 = 2  # except 1d tensors
    MOSTLY_Q4_1 = 3  # except 1d tensors
    # MOSTLY_Q4_1_SOME_F16 = 4  # tok_embeddings.weight and output.weight are F16
    # MOSTLY_Q4_2 = 5  # support has been removed
    # MOSTLY_Q4_3 = 6  # support has been removed
    MOSTLY_Q8_0 = 7  # except 1d tensors
    MOSTLY_Q5_0 = 8  # except 1d tensors
    MOSTLY_Q5_1 = 9  # except 1d tensors
    MOSTLY_Q2_K = 10  # except 1d tensors
    MOSTLY_Q3_K_S = 11  # except 1d tensors
    MOSTLY_Q3_K_M = 12  # except 1d tensors
    MOSTLY_Q3_K_L = 13  # except 1d tensors
    MOSTLY_Q4_K_S = 14  # except 1d tensors
    MOSTLY_Q4_K_M = 15  # except 1d tensors
    MOSTLY_Q5_K_S = 16  # except 1d tensors
    MOSTLY_Q5_K_M = 17  # except 1d tensors
    MOSTLY_Q6_K = 18  # except 1d tensors
    MOSTLY_IQ2_XXS = 19  # except 1d tensors
    MOSTLY_IQ2_XS = 20  # except 1d tensors
    MOSTLY_Q2_K_S = 21  # except 1d tensors
    MOSTLY_IQ3_XS = 22  # except 1d tensors
    MOSTLY_IQ3_XXS = 23  # except 1d tensors
    MOSTLY_IQ1_S = 24  # except 1d tensors
    MOSTLY_IQ4_NL = 25  # except 1d tensors
    MOSTLY_IQ3_S = 26  # except 1d tensors
    MOSTLY_IQ3_M = 27  # except 1d tensors
    MOSTLY_IQ2_S = 28  # except 1d tensors
    MOSTLY_IQ2_M = 29  # except 1d tensors
    MOSTLY_IQ4_XS = 30  # except 1d tensors
    MOSTLY_IQ1_M = 31  # except 1d tensors
    MOSTLY_BF16 = 32  # except 1d tensors
    MOSTLY_Q4_0_4_4 = 33  # except 1d tensors
    MOSTLY_Q4_0_4_8 = 34  # except 1d tensors
    MOSTLY_Q4_0_8_8 = 35  # except 1d tensors
    MOSTLY_TQ1_0 = 36  # except 1d tensors
    MOSTLY_TQ2_0 = 37  # except 1d tensors

    GUESSED = 1024  # not specified in the model file
|
|
|
|
|
|
|
|
|
2023-11-11 06:04:50 +01:00
|
|
|
class GGUFEndian(IntEnum):
    """Byte-order selector: LITTLE is 0, BIG is 1."""

    LITTLE = 0
    BIG = 1
|
|
|
|
|
|
|
|
|
|
|
|
class GGUFValueType(IntEnum):
    """Integer type codes for GGUF metadata values."""

    UINT8 = 0
    INT8 = 1
    UINT16 = 2
    INT16 = 3
    UINT32 = 4
    INT32 = 5
    FLOAT32 = 6
    BOOL = 7
    STRING = 8
    ARRAY = 9
    UINT64 = 10
    INT64 = 11
    FLOAT64 = 12

    @staticmethod
    def get_type(val: Any) -> GGUFValueType:
        """Return the value type used for a plain Python value.

        ``str``/``bytes``/``bytearray`` map to STRING, ``list`` to ARRAY,
        ``float`` to FLOAT32, ``bool`` to BOOL and any other ``int`` to INT32.
        The 64-bit and unsigned types are never selected here.

        Raises:
            ValueError: if ``val`` is none of the types above.
        """
        if isinstance(val, (str, bytes, bytearray)):
            return GGUFValueType.STRING
        elif isinstance(val, list):
            return GGUFValueType.ARRAY
        elif isinstance(val, float):
            return GGUFValueType.FLOAT32
        # bool must be tested before int: bool is a subclass of int.
        elif isinstance(val, bool):
            return GGUFValueType.BOOL
        elif isinstance(val, int):
            return GGUFValueType.INT32
        # TODO: need help with 64-bit types in Python
        else:
            raise ValueError(f"Unknown type: {type(val)}")
|
2023-11-11 06:04:50 +01:00
|
|
|
|
|
|
|
|
|
|
|
# Items here are (block size, type size)
# Sizes are written as sums of their parts rather than as precomputed totals.
QK_K = 256
GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
    GGMLQuantizationType.F32: (1, 4),
    GGMLQuantizationType.F16: (1, 2),
    GGMLQuantizationType.Q4_0: (32, 2 + 16),
    GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
    GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
    GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
    GGMLQuantizationType.Q8_0: (32, 2 + 32),
    GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
    GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4),
    GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12),
    GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12),
    GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
    GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
    GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8),
    GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4),
    GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32),
    GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8),
    GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16),
    GGMLQuantizationType.IQ4_NL: (32, 2 + 16),
    GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
    GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16),
    GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64),
    GGMLQuantizationType.I8: (1, 1),
    GGMLQuantizationType.I16: (1, 2),
    GGMLQuantizationType.I32: (1, 4),
    GGMLQuantizationType.I64: (1, 8),
    GGMLQuantizationType.F64: (1, 8),
    GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
    GGMLQuantizationType.BF16: (1, 2),
    GGMLQuantizationType.Q4_0_4_4: (32, 2 + 16),
    GGMLQuantizationType.Q4_0_4_8: (32, 2 + 16),
    GGMLQuantizationType.Q4_0_8_8: (32, 2 + 16),
    GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
    GGMLQuantizationType.TQ2_0: (256, 2 + 64),
}
|
|
|
|
|
|
|
|
|
|
|
|
# Aliases for backward compatibility.
# Each KEY_* name below is bound to the corresponding attribute of Keys.

# general
KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE
KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION
KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT
KEY_GENERAL_NAME = Keys.General.NAME
KEY_GENERAL_AUTHOR = Keys.General.AUTHOR
KEY_GENERAL_URL = Keys.General.URL
KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION
KEY_GENERAL_LICENSE = Keys.General.LICENSE
KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL
KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE

# LLM
KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE
KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH
KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH
KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT
KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH
KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT

# attention
KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT
KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV
KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS
KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV
KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS
KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS

# RoPE
KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT
KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE
KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE
KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR
KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN
KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED
|
|
|
|
|
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 23:31:00 +01:00
|
|
|
# SSM: flat module-level aliases, each bound to the matching Keys.SSM attribute
|
|
|
|
KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL
|
|
|
|
KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
|
|
|
|
KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
|
|
|
|
KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
|
2024-08-21 10:06:36 +02:00
|
|
|
KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS
|
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was way too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 23:31:00 +01:00
|
|
|
|
2023-11-11 06:04:50 +01:00
|
|
|
# tokenization: flat module-level aliases, each bound to the matching
# Keys.Tokenizer attribute
|
|
|
|
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
|
2024-04-29 15:58:41 +02:00
|
|
|
KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE
|
2023-11-11 06:04:50 +01:00
|
|
|
KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST
|
|
|
|
KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE
|
|
|
|
KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES
|
|
|
|
KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES
|
|
|
|
KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID
|
|
|
|
KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
|
|
|
|
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
|
|
|
|
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
|
|
|
|
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
|
2024-02-15 14:14:37 +01:00
|
|
|
KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
|
|
|
|
KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
|
2023-11-11 06:04:50 +01:00
|
|
|
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
|
|
|
|
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
|
2024-04-16 08:13:13 +02:00
|
|
|
# NOTE: "PRIFIX" is a misspelling of "PREFIX". The misspelled name is kept so
# that existing references keep resolving; the correctly spelled alias below
# binds the same Keys.Tokenizer.PREFIX_ID value and should be preferred.
KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID
|
|
|
|
KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
|
|
|
|
KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
|
|
|
|
KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
|
2024-08-05 09:38:01 +02:00
|
|
|
KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
|