allow server to multithread
Because web browsers send a lot of garbage requests, we want the server to run multithreaded so that 404s for favicons etc. can be served concurrently. To avoid blowing up llama, each handler that touches the model simply takes a mutex when it is invoked.
This commit is contained in:
parent a30d4b2a8f
commit 7a3895641c
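In outline, the pattern the commit introduces looks like this — a minimal sketch, assuming cpp-httplib's httplib::Server and a trimmed-down llama_server_context; the handler body is a placeholder, not the real server.cpp code:

    #include <mutex>

    #include "httplib.h"

    struct llama_server_context {
        std::mutex mutex;

        // Hand out an RAII lock; the handler holds it for the whole request.
        std::unique_lock<std::mutex> lock() {
            return std::unique_lock<std::mutex>(mutex);
        }
        // ... model state would live here ...
    };

    int main() {
        llama_server_context llama;
        httplib::Server svr;  // multithreaded once the single-thread define is gone

        // Model-touching endpoint: serialized by the mutex.
        svr.Post("/completion", [&llama](const httplib::Request &, httplib::Response & res) {
            auto lock = llama.lock();
            res.set_content("...generated text...", "text/plain");
        });

        // Everything else (favicon 404s and other browser noise) is served
        // concurrently, without ever taking the lock.
        svr.listen("127.0.0.1", 8080);
    }

Requests that never touch the model are handled on other pool threads and return immediately, while model requests queue on the mutex.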
@@ -2,8 +2,6 @@
#include "llama.h"
#include "build-info.h"

// single thread
#define CPPHTTPLIB_THREAD_POOL_COUNT 1
#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
#define CPPHTTPLIB_NO_EXCEPTIONS 1
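Dropping the single-thread override lets cpp-httplib fall back to its default worker-pool size (in recent versions derived from std::thread::hardware_concurrency()). If a fixed pool size were ever wanted again, it would be set the same way, before the header is included — a hypothetical sketch, not something this patch does:

    // Hypothetical: pin the HTTP worker pool to 4 threads instead of the library default.
    // The define has to appear before httplib.h is included.
    #define CPPHTTPLIB_THREAD_POOL_COUNT 4
    #include "httplib.h"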
@@ -34,7 +32,6 @@ struct server_params {
    int32_t port = 8080;
    int32_t read_timeout = 600;
    int32_t write_timeout = 600;

};

// completion token output with probabilities
@@ -183,6 +180,12 @@ struct llama_server_context {
    std::string stopping_word;
    int32_t multibyte_pending = 0;

    std::mutex mutex;

    std::unique_lock<std::mutex> lock() {
        return std::unique_lock<std::mutex>(mutex);
    }

    ~llama_server_context() {
        if (ctx) {
            llama_free(ctx);
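Because lock() returns the std::unique_lock by value, each handler gets RAII behaviour for free: the mutex is released when the lock object goes out of scope, even if parsing or generation throws. Intended usage, sketched with the same Request/Response aliases the handlers below use:

    svr.Post("/completion", [&llama](const Request & req, Response & res) {
        auto lock = llama.lock();  // held for the lifetime of the handler
        // ... parse req.body, run generation, fill res ...
    });                            // lock destroyed here -> mutex released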
@@ -912,6 +915,7 @@ int main(int argc, char ** argv) {
    Server svr;

    svr.set_default_headers({
        { "Server", "llama.cpp" },
        { "Access-Control-Allow-Origin", "*" },
        { "Access-Control-Allow-Headers", "content-type" }
    });
@@ -929,7 +933,10 @@ int main(int argc, char ** argv) {
    });

    svr.Post("/completion", [&llama](const Request & req, Response & res) {
        auto lock = llama.lock();

        llama.rewind();

        llama_reset_timings(llama.ctx);

        parse_options_completion(json::parse(req.body), llama);
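Since the lock is taken before llama.rewind() and before the request body is even parsed, a second model request arriving mid-generation simply blocks in llama.lock() until the first one finishes. A rough standalone illustration of that ordering (not part of the patch; the mutex and the sleep stand in for the shared context and a long generation):

    #include <chrono>
    #include <mutex>
    #include <thread>

    std::mutex m;  // plays the role of llama_server_context::mutex

    void handle_request() {
        std::unique_lock<std::mutex> lock(m);                          // same pattern as llama.lock()
        std::this_thread::sleep_for(std::chrono::milliseconds(100));   // pretend to generate
    }

    int main() {
        std::thread first(handle_request);   // e.g. /completion
        std::thread second(handle_request);  // e.g. /tokenize -- waits for the first lock to be released
        first.join();
        second.join();
    }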
@@ -1038,6 +1045,8 @@ int main(int argc, char ** argv) {
    });

    svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
        auto lock = llama.lock();

        const json body = json::parse(req.body);
        const std::string content = body.value("content", "");
        const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
@@ -1046,6 +1055,8 @@ int main(int argc, char ** argv) {
    });

    svr.Post("/embedding", [&llama](const Request & req, Response & res) {
        auto lock = llama.lock();

        const json body = json::parse(req.body);

        llama.rewind();