diff --git a/llama.cpp b/llama.cpp index 4ac46193c..0a5f755ca 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8443,6 +8443,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s quantize &= params->quantize_output_tensor || name != "output.weight"; quantize &= !params->only_copy; + // do not quantize expert gating tensors + quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; + enum ggml_type new_type; void * new_data; size_t new_size;