allow server to multithread
Because web browsers send a lot of garbage requests, we want the server to run multithreaded so that 404s for favicons etc. can be served concurrently. To avoid blowing up llama, each handler that touches the model simply takes a mutex when it is invoked.
This commit is contained in:
parent a30d4b2a8f
commit 7a3895641c
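In outline, the pattern the commit introduces looks like this — a minimal sketch, assuming cpp-httplib's httplib::Server and a trimmed-down llama_server_context; the handler body is a placeholder, not the real server.cpp code:

    #include <mutex>

    #include "httplib.h"

    struct llama_server_context {
        std::mutex mutex;

        // Hand out an RAII lock; the handler holds it for the whole request.
        std::unique_lock<std::mutex> lock() {
            return std::unique_lock<std::mutex>(mutex);
        }
        // ... model state would live here ...
    };

    int main() {
        llama_server_context llama;
        httplib::Server svr;  // multithreaded once the single-thread define is gone

        // Model-touching endpoint: serialized by the mutex.
        svr.Post("/completion", [&llama](const httplib::Request &, httplib::Response & res) {
            auto lock = llama.lock();
            res.set_content("...generated text...", "text/plain");
        });

        // Everything else (favicon 404s and other browser noise) is served
        // concurrently, without ever taking the lock.
        svr.listen("127.0.0.1", 8080);
    }

Requests that never touch the model are handled on other pool threads and return immediately, while model requests queue on the mutex.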
@@ -2,8 +2,6 @@
#include "llama.h"
#include "build-info.h"

// single thread
#define CPPHTTPLIB_THREAD_POOL_COUNT 1
#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
#define CPPHTTPLIB_NO_EXCEPTIONS 1
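Dropping the single-thread override lets cpp-httplib fall back to its default worker-pool size (in recent versions derived from std::thread::hardware_concurrency()). If a fixed pool size were ever wanted again, it would be set the same way, before the header is included — a hypothetical sketch, not something this patch does:

    // Hypothetical: pin the HTTP worker pool to 4 threads instead of the library default.
    // The define has to appear before httplib.h is included.
    #define CPPHTTPLIB_THREAD_POOL_COUNT 4
    #include "httplib.h"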
@@ -34,7 +32,6 @@ struct server_params {
    int32_t port = 8080;
    int32_t read_timeout = 600;
    int32_t write_timeout = 600;

};

// completion token output with probabilities
@@ -183,6 +180,12 @@ struct llama_server_context {
    std::string stopping_word;
    int32_t multibyte_pending = 0;

    std::mutex mutex;

    std::unique_lock<std::mutex> lock() {
        return std::unique_lock<std::mutex>(mutex);
    }

    ~llama_server_context() {
        if (ctx) {
            llama_free(ctx);
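Because lock() returns the std::unique_lock by value, each handler gets RAII behaviour for free: the mutex is released when the lock object goes out of scope, even if parsing or generation throws. Intended usage, sketched with the same Request/Response aliases the handlers below use:

    svr.Post("/completion", [&llama](const Request & req, Response & res) {
        auto lock = llama.lock();  // held for the lifetime of the handler
        // ... parse req.body, run generation, fill res ...
    });                            // lock destroyed here -> mutex released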
@@ -912,6 +915,7 @@ int main(int argc, char ** argv) {
    Server svr;

    svr.set_default_headers({
        { "Server", "llama.cpp" },
        { "Access-Control-Allow-Origin", "*" },
        { "Access-Control-Allow-Headers", "content-type" }
    });
@@ -929,7 +933,10 @@ int main(int argc, char ** argv) {
    });

    svr.Post("/completion", [&llama](const Request & req, Response & res) {
        auto lock = llama.lock();

        llama.rewind();

        llama_reset_timings(llama.ctx);

        parse_options_completion(json::parse(req.body), llama);
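Since the lock is taken before llama.rewind() and before the request body is even parsed, a second model request arriving mid-generation simply blocks in llama.lock() until the first one finishes. A rough standalone illustration of that ordering (not part of the patch; the mutex and the sleep stand in for the shared context and a long generation):

    #include <chrono>
    #include <mutex>
    #include <thread>

    std::mutex m;  // plays the role of llama_server_context::mutex

    void handle_request() {
        std::unique_lock<std::mutex> lock(m);                          // same pattern as llama.lock()
        std::this_thread::sleep_for(std::chrono::milliseconds(100));   // pretend to generate
    }

    int main() {
        std::thread first(handle_request);   // e.g. /completion
        std::thread second(handle_request);  // e.g. /tokenize -- waits for the first lock to be released
        first.join();
        second.join();
    }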
@@ -1038,6 +1045,8 @@ int main(int argc, char ** argv) {
    });

    svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
        auto lock = llama.lock();

        const json body = json::parse(req.body);
        const std::string content = body.value("content", "");
        const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
@@ -1046,6 +1055,8 @@ int main(int argc, char ** argv) {
    });

    svr.Post("/embedding", [&llama](const Request & req, Response & res) {
        auto lock = llama.lock();

        const json body = json::parse(req.body);

        llama.rewind();