mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-26 12:21:40 +01:00
gemma : use more bits for the token_embd.weight tensor (#5650)
* gemma : use Q8_0 for the token_embd.weight tensor
* llama : quantize token_embd.weight using output type
This commit is contained in:
parent
847eedbdb2
commit
96633eeca1
@@ -10498,7 +10498,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|||||||
return std::make_pair(i_layer, n_layer);
|
return std::make_pair(i_layer, n_layer);
|
||||||
};
|
};
|
||||||
|
|
||||||
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
|
||||||
|
// with the quantization of the output tensor
|
||||||
|
if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
|
||||||
|
(LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
|
||||||
int nx = tensor->ne[0];
|
int nx = tensor->ne[0];
|
||||||
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
||||||
new_type = GGML_TYPE_Q8_0;
|
new_type = GGML_TYPE_Q8_0;
|
||||||
|
Loading…
Reference in New Issue
Block a user