From 806d397c1a13416c10e01908d95b04ce0e9fa8a4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Sep 2023 13:21:36 +0300 Subject: [PATCH] parallel : try smaller batches when the KV cache is fragmented --- examples/parallel/parallel.cpp | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 3c3fe6ddb..c35552e4a 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -83,7 +83,7 @@ int main(int argc, char ** argv) { const int n_clients = 8; // insert new requests as soon as the previous one is done - const bool hot_plug = false; + const bool hot_plug = true; // requests to simulate const int32_t n_seq = 128; @@ -202,8 +202,10 @@ int main(int argc, char ** argv) { } // process in chunks of params.n_batch - for (size_t i = 0; i < batch_token.size(); i += params.n_batch) { - n_tokens = std::min(params.n_batch, (int32_t) (batch_token.size() - i)); + int32_t n_batch = params.n_batch; + + for (int32_t i = 0; i < (int32_t) batch_token.size(); i += n_batch) { + n_tokens = std::min(n_batch, (int32_t) (batch_token.size() - i)); llama_batch batch = { n_tokens, @@ -216,10 +218,22 @@ int main(int argc, char ** argv) { }; if (llama_decode(ctx, batch, params.n_threads)) { - LOG_TEE("%s : failed to decode batch\n", __func__); - return 1; + if (n_batch == 1) { + LOG_TEE("%s : failed to decode batch\n", __func__); + return 1; + } + + LOG("%s : failed to decode batch, retrying with n_batch = %d\n", __func__, n_batch / 2); + + // retry with half the batch size to try to find a free slot in the KV cache + n_batch /= 2; + i -= n_batch; + + continue; } + LOG_TEE("%s : decoded batch of %d tokens\n", __func__, n_tokens); + for (auto & client : clients) { if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) { continue;