llama : fix kv cache heuristic when context is less than 32

This commit is contained in:
Georgi Gerganov 2023-09-27 18:12:43 +03:00
parent 8845160058
commit c1596f633f
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

View File

@@ -4117,7 +4117,7 @@ static int llama_decode_internal(
 // after enough generations, the benefit from this heuristic disappears
 // if we start defragmenting the cache, the benefit from this will be more important
 //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
-kv_self.n = std::max(32, llama_kv_cache_cell_max(kv_self));
+kv_self.n = std::min((int32_t) hparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
 //printf("kv_self.n = %d\n", kv_self.n);