diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 031824e14..1d30a15a6 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1406,7 +1406,7 @@ struct llama_server_context
         task.multitask_id = multitask_id;
 
         // when a completion task's prompt array is not a singleton, we split it into multiple requests
-        if (task.data.at("prompt").size() > 1)
+        if (task.data.count("prompt") && task.data.at("prompt").size() > 1)
         {
             lock.unlock(); // entering new func scope
             return split_multiprompt_task(task);
@@ -1577,9 +1577,9 @@ struct llama_server_context
 
                     slot->reset();
 
-                    slot->infill = task.infill_mode;
-                    slot->embedding = task.embedding_mode;
-                    slot->task_id = task.id;
+                    slot->infill       = task.infill_mode;
+                    slot->embedding    = task.embedding_mode;
+                    slot->task_id      = task.id;
                     slot->multitask_id = task.multitask_id;
 
                     if (!launch_slot_with_data(slot, task.data))
@@ -1731,7 +1731,8 @@ struct llama_server_context
                 const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
 
                 // empty prompt passed -> release the slot and send empty response
-                if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
+                // note: infill mode allows empty prompt
+                if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt && !slot.infill)
                 {
                     slot.release();
                     slot.print_timings();
@@ -2609,8 +2610,8 @@ static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
             {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
             {"usage", json{{"completion_tokens", num_tokens_predicted},
-                           {"prompt_tokens", num_prompt_tokens},
-                           {"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
+                           {"prompt_tokens",     num_prompt_tokens},
+                           {"total_tokens",      num_tokens_predicted + num_prompt_tokens}}},
             {"id", gen_chatcmplid()}};
 
     if (server_verbose) {
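
Context for the first hunk: the server's json type is nlohmann::json, whose at("prompt") throws json::out_of_range when the key is absent, which is presumably what an infill request with no "prompt" field would hit. Below is a minimal standalone sketch of the guarded check; the task_data object is a made-up stand-in for illustration, not code from the patch.

#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    // hypothetical infill-style payload: prefix/suffix provided, no "prompt" key
    nlohmann::json task_data = {
        {"input_prefix", "int add(int a, int b) {"},
        {"input_suffix", "}"}
    };

    // old check: task_data.at("prompt").size() > 1  -> throws json::out_of_range here
    // new check: count("prompt") short-circuits, so a missing key is harmless
    const bool is_multiprompt = task_data.count("prompt") && task_data.at("prompt").size() > 1;

    std::cout << std::boolalpha << is_multiprompt << std::endl; // prints "false"
    return 0;
}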