diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index cd17cc711..eb6915c27 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2,8 +2,6 @@
 #include "llama.h"
 #include "build-info.h"
 
-// single thread
-#define CPPHTTPLIB_THREAD_POOL_COUNT 1
 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
 #define CPPHTTPLIB_NO_EXCEPTIONS 1
@@ -34,7 +32,6 @@ struct server_params {
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
-
 };
 
 // completion token output with probabilities
@@ -183,6 +180,12 @@ struct llama_server_context {
     std::string stopping_word;
     int32_t multibyte_pending = 0;
 
+    std::mutex mutex;
+
+    std::unique_lock<std::mutex> lock() {
+        return std::unique_lock<std::mutex>(mutex);
+    }
+
     ~llama_server_context() {
         if (ctx) {
             llama_free(ctx);
@@ -912,6 +915,7 @@ int main(int argc, char ** argv) {
     Server svr;
 
     svr.set_default_headers({
+        { "Server", "llama.cpp" },
         { "Access-Control-Allow-Origin", "*" },
         { "Access-Control-Allow-Headers", "content-type" }
     });
@@ -929,7 +933,10 @@ int main(int argc, char ** argv) {
     });
 
     svr.Post("/completion", [&llama](const Request & req, Response & res) {
+        auto lock = llama.lock();
+
         llama.rewind();
+
         llama_reset_timings(llama.ctx);
 
         parse_options_completion(json::parse(req.body), llama);
@@ -1038,6 +1045,8 @@ int main(int argc, char ** argv) {
     });
 
     svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
+        auto lock = llama.lock();
+
         const json body = json::parse(req.body);
         const std::string content = body.value("content", "");
         const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
@@ -1046,6 +1055,8 @@ int main(int argc, char ** argv) {
     });
 
     svr.Post("/embedding", [&llama](const Request & req, Response & res) {
+        auto lock = llama.lock();
+
         const json body = json::parse(req.body);
 
         llama.rewind();
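
The net effect of the patch: instead of pinning cpp-httplib to a single worker thread via CPPHTTPLIB_THREAD_POOL_COUNT, the server keeps the library's default thread pool and serializes access to the shared llama_server_context through a mutex, with each handler holding a std::unique_lock for the duration of the request. Below is a minimal standalone sketch of that pattern; the shared_context and handle_request names are illustrative stand-ins, not code from server.cpp, and only the shape of lock() mirrors the diff:

```cpp
// Sketch (not llama.cpp code) of the locking pattern this diff introduces:
// each HTTP handler acquires a std::unique_lock on a mutex owned by the
// shared context, so concurrent requests are serialized while the HTTP
// layer itself stays multithreaded.
#include <chrono>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

struct shared_context {              // stand-in for llama_server_context
    std::mutex mutex;

    // Returning the lock by value (unique_lock is movable) keeps the mutex
    // held for the lifetime of the caller's local `lock` variable (RAII).
    std::unique_lock<std::mutex> lock() {
        return std::unique_lock<std::mutex>(mutex);
    }
};

// Stand-in for a request handler body such as POST /completion.
void handle_request(shared_context & ctx, int id) {
    auto lock = ctx.lock();          // blocks until any in-flight request finishes
    std::printf("request %d enters critical section\n", id);
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    std::printf("request %d leaves critical section\n", id);
}                                    // lock destroyed here, mutex released

int main() {
    shared_context ctx;
    std::vector<std::thread> workers;
    for (int i = 0; i < 4; ++i) {    // simulate four concurrent requests
        workers.emplace_back(handle_request, std::ref(ctx), i);
    }
    for (auto & t : workers) {
        t.join();
    }
    return 0;
}
```

Because the guard is scope-bound, the mutex is released even if a handler throws, which the single-thread #define could not guarantee anything about; it simply avoided concurrency altogether.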