mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-07 11:23:56 +01:00
allow server to multithread
Because web browsers send a lot of garbage requests, we want the server to multithread when serving 404s for favicons etc. To avoid blowing up llama, we just take a mutex when it's invoked.
This commit is contained in:
parent
a30d4b2a8f
commit
7a3895641c
@ -2,8 +2,6 @@
|
|||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "build-info.h"
|
#include "build-info.h"
|
||||||
|
|
||||||
// single thread
|
|
||||||
#define CPPHTTPLIB_THREAD_POOL_COUNT 1
|
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
// crash the server in debug mode, otherwise send an http 500 error
|
// crash the server in debug mode, otherwise send an http 500 error
|
||||||
#define CPPHTTPLIB_NO_EXCEPTIONS 1
|
#define CPPHTTPLIB_NO_EXCEPTIONS 1
|
||||||
@ -34,7 +32,6 @@ struct server_params {
|
|||||||
int32_t port = 8080;
|
int32_t port = 8080;
|
||||||
int32_t read_timeout = 600;
|
int32_t read_timeout = 600;
|
||||||
int32_t write_timeout = 600;
|
int32_t write_timeout = 600;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// completion token output with probabilities
|
// completion token output with probabilities
|
||||||
@ -183,6 +180,12 @@ struct llama_server_context {
|
|||||||
std::string stopping_word;
|
std::string stopping_word;
|
||||||
int32_t multibyte_pending = 0;
|
int32_t multibyte_pending = 0;
|
||||||
|
|
||||||
|
std::mutex mutex;
|
||||||
|
|
||||||
|
std::unique_lock<std::mutex> lock() {
|
||||||
|
return std::unique_lock<std::mutex>(mutex);
|
||||||
|
}
|
||||||
|
|
||||||
~llama_server_context() {
|
~llama_server_context() {
|
||||||
if (ctx) {
|
if (ctx) {
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
@ -912,6 +915,7 @@ int main(int argc, char ** argv) {
|
|||||||
Server svr;
|
Server svr;
|
||||||
|
|
||||||
svr.set_default_headers({
|
svr.set_default_headers({
|
||||||
|
{ "Server", "llama.cpp" },
|
||||||
{ "Access-Control-Allow-Origin", "*" },
|
{ "Access-Control-Allow-Origin", "*" },
|
||||||
{ "Access-Control-Allow-Headers", "content-type" }
|
{ "Access-Control-Allow-Headers", "content-type" }
|
||||||
});
|
});
|
||||||
@ -929,7 +933,10 @@ int main(int argc, char ** argv) {
|
|||||||
});
|
});
|
||||||
|
|
||||||
svr.Post("/completion", [&llama](const Request & req, Response & res) {
|
svr.Post("/completion", [&llama](const Request & req, Response & res) {
|
||||||
|
auto lock = llama.lock();
|
||||||
|
|
||||||
llama.rewind();
|
llama.rewind();
|
||||||
|
|
||||||
llama_reset_timings(llama.ctx);
|
llama_reset_timings(llama.ctx);
|
||||||
|
|
||||||
parse_options_completion(json::parse(req.body), llama);
|
parse_options_completion(json::parse(req.body), llama);
|
||||||
@ -1038,6 +1045,8 @@ int main(int argc, char ** argv) {
|
|||||||
});
|
});
|
||||||
|
|
||||||
svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
|
svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
|
||||||
|
auto lock = llama.lock();
|
||||||
|
|
||||||
const json body = json::parse(req.body);
|
const json body = json::parse(req.body);
|
||||||
const std::string content = body.value("content", "");
|
const std::string content = body.value("content", "");
|
||||||
const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
|
const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
|
||||||
@ -1046,6 +1055,8 @@ int main(int argc, char ** argv) {
|
|||||||
});
|
});
|
||||||
|
|
||||||
svr.Post("/embedding", [&llama](const Request & req, Response & res) {
|
svr.Post("/embedding", [&llama](const Request & req, Response & res) {
|
||||||
|
auto lock = llama.lock();
|
||||||
|
|
||||||
const json body = json::parse(req.body);
|
const json body = json::parse(req.body);
|
||||||
|
|
||||||
llama.rewind();
|
llama.rewind();
|
||||||
|
Loading…
Reference in New Issue
Block a user