mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-14 22:38:58 +01:00
llama : quantization-related fixes for T5
This commit is contained in:
parent
7d7fff4654
commit
6dc9eb4040
@ -17197,8 +17197,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||||||
//
|
//
|
||||||
// - qs.n_attention_wv == 0 for Mamba models
|
// - qs.n_attention_wv == 0 for Mamba models
|
||||||
// - qs.n_attention_wv == model.hparams.n_layer for Transformer models
|
// - qs.n_attention_wv == model.hparams.n_layer for Transformer models
|
||||||
|
// - qs.n_attention_wv == 3 * model.hparams.n_layer for Encoder-Decoder models
|
||||||
//
|
//
|
||||||
GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
|
GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
|
||||||
|
|
||||||
size_t total_size_org = 0;
|
size_t total_size_org = 0;
|
||||||
size_t total_size_new = 0;
|
size_t total_size_new = 0;
|
||||||
@ -17323,6 +17324,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||||||
quantize &= name.find("ssm_x.weight") == std::string::npos;
|
quantize &= name.find("ssm_x.weight") == std::string::npos;
|
||||||
quantize &= name.find("ssm_dt.weight") == std::string::npos;
|
quantize &= name.find("ssm_dt.weight") == std::string::npos;
|
||||||
|
|
||||||
|
// do not quantize relative position bias (T5)
|
||||||
|
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
|
||||||
|
|
||||||
enum ggml_type new_type;
|
enum ggml_type new_type;
|
||||||
void * new_data;
|
void * new_data;
|
||||||
size_t new_size;
|
size_t new_size;
|
||||||
|
Loading…
Reference in New Issue
Block a user