Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-02-03 07:13:00 +01:00)
llama : do not cap thread count when MoE on CPU (#5419)
* Not capping thread count when MoE inference is running on CPU

* Whitespace
parent e4124c2477
commit e5ca3937c6
@@ -7285,7 +7285,9 @@ static int llama_decode_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
     // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
     // with the BLAS calls. need a better solution
-    if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
+    // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
+    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
         n_threads = std::min(4, n_threads);
     }
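The effect of the change is easier to see in isolation. Below is a minimal, self-contained C++ sketch of the capping decision as a pure function; the function name effective_thread_count and the boolean parameters standing in for ggml_cpu_has_blas() / ggml_cpu_has_gpublas() and hparams.n_expert are hypothetical stand-ins for illustration, not part of llama.cpp.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical distillation of the decision in llama_decode_internal().
    // n_expert mirrors hparams.n_expert; the two flags stand in for
    // ggml_cpu_has_blas() and ggml_cpu_has_gpublas().
    static int effective_thread_count(int n_threads, int n_tokens, uint32_t n_expert,
                                      bool cpu_has_blas, bool cpu_has_gpublas) {
        // Large batches on a CPU BLAS backend: cap the thread count so the
        // extra threads do not interfere with the BLAS calls -- but only for
        // dense (non-MoE) models, since MoE mul_mat ops bypass Accelerate/BLAS
        // and benefit from every available thread.
        if (n_tokens >= 32 && n_expert == 0 && cpu_has_blas && !cpu_has_gpublas) {
            return std::min(4, n_threads);
        }
        return n_threads;
    }

    int main() {
        // Dense model, big batch, CPU BLAS: capped to 4 threads.
        std::printf("dense: %d\n", effective_thread_count(8, 512, 0, true, false));
        // MoE model (n_expert > 0): the cap is skipped, all 8 threads kept.
        std::printf("moe:   %d\n", effective_thread_count(8, 512, 8, true, false));
    }

Under these assumptions, the pre-patch behavior corresponds to calling the function with the n_expert check removed: an MoE model running a 512-token batch on 8 CPU threads would have been throttled to 4 even though no BLAS call was ever going to run.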