From 42cadc74bda60afafb45b71b1a39d150ede0ed4d Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Sat, 2 Nov 2024 16:34:56 +0000 Subject: [PATCH] server : fix slot selection by lru (#10126) * server : fix slot selection by lru, migrate lcs to `size_t` * minor debug log fix --- examples/server/server.cpp | 14 ++++++++------ examples/server/utils.hpp | 14 +++++++------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5c1af549b..8531a784d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -247,6 +247,7 @@ struct server_slot { if (is_processing()) { SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated); + t_last_used = ggml_time_us(); t_token_generation = (ggml_time_us() - t_start_generation) / 1e3; state = SLOT_STATE_IDLE; callback_on_release(id); @@ -730,7 +731,7 @@ struct server_context { // find the slot that has at least n% prompt similarity if (ret == nullptr && slot_prompt_similarity != 0.0f) { - int max_lcs_len = 0; + int lcs_len = 0; float similarity = 0; for (server_slot & slot : slots) { @@ -745,20 +746,21 @@ struct server_context { } // length of the Longest Common Subsequence between the current slot's prompt and the input prompt - int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens); + int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens); // fraction of the common subsequence length compared to the current slot's prompt length - similarity = static_cast<float>(lcs_len) / static_cast<float>(slot.cache_tokens.size()); + float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<float>(slot.cache_tokens.size()); // select the current slot if the criteria match - if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) { - max_lcs_len = lcs_len; + if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) { + lcs_len = cur_lcs_len; + similarity = cur_similarity; ret = &slot; 
} } if (ret != nullptr) { - SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity); + SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity); } } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 871a17a4f..c47ed3e47 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -453,20 +453,20 @@ static size_t longest_common_subsequence(const llama_tokens & a, const llama_tok } // get the lengths of the input sequences - int a_len = a.size(); - int b_len = b.size(); + size_t a_len = a.size(); + size_t b_len = b.size(); // initialize the maximum length of the longest common subsequence (LCS) - int max_length = 0; + size_t max_length = 0; // use two rows instead of a 2D matrix to optimize space - std::vector<int> prev_row(b_len + 1, 0); - std::vector<int> curr_row(b_len + 1, 0); + std::vector<size_t> prev_row(b_len + 1, 0); + std::vector<size_t> curr_row(b_len + 1, 0); // iterate through the elements of a - for (int i = 1; i <= a_len; i++) { + for (size_t i = 1; i <= a_len; i++) { // iterate through the elements of b - for (int j = 1; j <= b_len; j++) { + for (size_t j = 1; j <= b_len; j++) { // if elements at the current positions match if (a[i - 1] == b[j - 1]) { // if it's the first element of either sequences, set LCS length to 1