server : do not block system prompt update (#3767)

* server : do not block system prompt update

* server : update state machine logic to process system prompts

* server : minor
This commit is contained in:
Georgi Gerganov 2023-10-24 23:08:20 +03:00 committed by GitHub
parent b2f7e04bd3
commit 1717521cdb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -454,7 +454,7 @@ struct llama_client_slot
} }
void release() { void release() {
if (state == PROCESSING) if (state == IDLE || state == PROCESSING)
{ {
t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
command = RELEASE; command = RELEASE;
@ -754,6 +754,7 @@ struct llama_server_context
} }
slot->params.antiprompt.clear(); slot->params.antiprompt.clear();
const auto &stop = data.find("stop"); const auto &stop = data.find("stop");
if (stop != data.end() && stop->is_array()) if (stop != data.end() && stop->is_array())
{ {
@ -867,7 +868,7 @@ struct llama_server_context
kv_cache_clear(); kv_cache_clear();
for (int32_t i = 0; i < batch.n_tokens; ++i) for (int i = 0; i < (int) system_tokens.size(); ++i)
{ {
llama_batch_add(batch, system_tokens[i], i, { 0 }, false); llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
} }
@ -894,16 +895,8 @@ struct llama_server_context
{ {
slot.release(); slot.release();
} }
wait_all_are_idle();
all_slots_are_idle = true;
// wait until system prompt load
system_need_update = true; system_need_update = true;
while (system_need_update)
{
std::this_thread::sleep_for(std::chrono::milliseconds(5));
}
// system prompt loaded, continue
} }
void process_system_prompt_data(const json &sys_props) { void process_system_prompt_data(const json &sys_props) {
@ -915,26 +908,6 @@ struct llama_server_context
{ {
notify_system_prompt_changed(); notify_system_prompt_changed();
} }
else
{
system_need_update = true;
}
}
void wait_all_are_idle() {
bool wait = true;
while (wait)
{
wait = false;
for (auto &slot : slots)
{
if (!slot.available())
{
wait = true;
break;
}
}
}
} }
static size_t find_stopping_strings(const std::string &text, const size_t last_token_size, static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
@ -965,7 +938,6 @@ struct llama_server_context
slot.has_next_token = false; slot.has_next_token = false;
} }
stop_pos = pos; stop_pos = pos;
} }
} }
@ -1444,7 +1416,7 @@ struct llama_server_context
process_tasks(); process_tasks();
// update the system prompt once all slots are idle // update the system prompt once all slots are idle
if (system_need_update) if (system_need_update && all_slots_are_idle)
{ {
LOG_TEE("updating system prompt\n"); LOG_TEE("updating system prompt\n");
update_system_prompt(); update_system_prompt();
@ -1498,7 +1470,7 @@ struct llama_server_context
for (auto & slot : slots) for (auto & slot : slots)
{ {
// release the slot // release the slot
if (slot.state == PROCESSING && slot.command == RELEASE) if (slot.command == RELEASE)
{ {
slot.state = IDLE; slot.state = IDLE;
slot.command = NONE; slot.command = NONE;
@ -1509,7 +1481,7 @@ struct llama_server_context
continue; continue;
} }
if (slot.state == IDLE || slot.command == RELEASE) if (slot.state == IDLE)
{ {
continue; continue;
} }
@ -1530,6 +1502,17 @@ struct llama_server_context
{ {
for (auto & slot : slots) for (auto & slot : slots)
{ {
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
// empty prompt passed -> release the slot and send empty response
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
{
slot.release();
slot.print_timings();
send_final_response(slot);
continue;
}
// need to process the prompt // need to process the prompt
if (slot.state == IDLE && slot.command == LOAD_PROMPT) if (slot.state == IDLE && slot.command == LOAD_PROMPT)
{ {
@ -1749,8 +1732,8 @@ struct llama_server_context
if (!process_token(result, slot)) if (!process_token(result, slot))
{ {
slot.release(); slot.release();
send_final_response(slot);
slot.print_timings(); slot.print_timings();
send_final_response(slot);
} }
slot.i_batch = -1; slot.i_batch = -1;