Mirror of https://github.com/ggerganov/llama.cpp.git
server : better security control for public deployments (#9776)
* server : more explicit endpoint access settings
* protect /props endpoint
* fix tests
* update server docs
* fix typo
* fix tests
Parent: fa42aa6d89
Commit: 458367a906
@@ -1838,9 +1838,23 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.endpoint_metrics = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
+    add_opt(llama_arg(
+        {"--slots"},
+        format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.endpoint_slots = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
+    add_opt(llama_arg(
+        {"--props"},
+        format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.endpoint_props = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
     add_opt(llama_arg(
         {"--no-slots"},
-        format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        "disables slots monitoring endpoint",
         [](gpt_params & params) {
             params.endpoint_slots = false;
         }
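Since the monitoring and property endpoints are now opt-in, deployments that rely on them have to pass the new flags explicitly. A minimal Python sketch, not part of this commit, that launches the server with the new endpoints enabled and waits for it to come up; the binary and model paths are placeholders:

```python
# Sketch only: launch llama-server with the new opt-in endpoints enabled.
# Binary and model paths below are placeholders, not taken from this commit.
import subprocess
import time
import urllib.request

proc = subprocess.Popen([
    "./llama-server",            # assumed build output location
    "-m", "models/model.gguf",   # placeholder model file
    "--port", "8080",
    "--slots",                   # opt in to GET /slots
    "--props",                   # opt in to POST /props
    "--metrics",                 # opt in to GET /metrics
])

# /health is a public endpoint, so it is a safe readiness probe
for _ in range(60):
    try:
        with urllib.request.urlopen("http://localhost:8080/health") as r:
            if r.status == 200:
                break
    except OSError:
        time.sleep(1)
```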
@@ -290,7 +290,10 @@ struct gpt_params {
     std::string ssl_file_key  = ""; // NOLINT
     std::string ssl_file_cert = ""; // NOLINT

-    bool endpoint_slots   = true;
+    // "advanced" endpoints are disabled by default for better security
+    bool webui            = true;
+    bool endpoint_slots   = false;
+    bool endpoint_props   = false; // only control POST requests, not GET
     bool endpoint_metrics = false;

     bool log_json = false;
@@ -18,6 +18,8 @@ The project is under active development, and we are [looking for feedback and co

 ## Usage

+<!-- Note for contributors: The list below is generated by llama-gen-docs -->
+
 **Common params**

 | Argument | Explanation |
@@ -149,7 +151,9 @@ The project is under active development, and we are [looking for feedback and co
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
 | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
-| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
+| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
+| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
+| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
@@ -380,8 +384,6 @@ node index.js

 `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`

-`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
-
 `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.

 **Response format**
@@ -519,34 +521,41 @@ Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/B

 Takes a prefix and a suffix and returns the predicted completion as stream.

 *Options:*

-`input_prefix`: Set the prefix of the code to infill.
-
-`input_suffix`: Set the suffix of the code to infill.
+- `input_prefix`: Set the prefix of the code to infill.
+- `input_suffix`: Set the suffix of the code to infill.

 It also accepts all the options of `/completion` except `stream` and `prompt`.

-- **GET** `/props`: Return current server settings.
+### **GET** `/props`: Get server global properties.
+
+This endpoint is public (no API key check). By default, it is read-only. To make POST request to change global properties, you need to start server with `--props`

 **Response format**

 ```json
 {
-    "assistant_name": "",
-    "user_name": "",
+    "system_prompt": "",
     "default_generation_settings": { ... },
     "total_slots": 1,
     "chat_template": ""
 }
 ```

-- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots.
-- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
+- `system_prompt` - the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt.
 - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
 - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
 - `chat_template` - the model's original Jinja2 prompt template

+### POST `/props`: Change server global properties.
+
+To use this endpoint with POST method, you need to start server with `--props`
+
+*Options:*
+
+- `system_prompt`: Change the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt.
+
 ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API

 Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
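The documented `/props` behavior can be exercised with a short client sketch. This is illustrative only: the base URL and the `THIS_IS_THE_KEY` api key are assumptions, and the POST only succeeds when the server was started with `--props`.

```python
# Sketch only: read global properties, then change the system prompt.
# Base URL and API key are placeholders; POST /props requires --props.
import json
import urllib.request

BASE = "http://localhost:8080"
HDRS = {"Authorization": "Bearer THIS_IS_THE_KEY", "Content-Type": "application/json"}

# GET /props is read-only
req = urllib.request.Request(f"{BASE}/props", headers=HDRS)
with urllib.request.urlopen(req) as r:
    props = json.load(r)
print(props["total_slots"], repr(props["chat_template"][:40]))

# POST /props updates the system prompt (only when --props is enabled)
req = urllib.request.Request(
    f"{BASE}/props",
    data=json.dumps({"system_prompt": "You are a concise assistant."}).encode("utf-8"),
    headers=HDRS,
    method="POST",
)
with urllib.request.urlopen(req) as r:
    print(json.load(r))  # expected: {'success': True}
```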
@@ -813,28 +822,6 @@ To know the `id` of the adapter, use GET `/lora-adapters`

 ## More examples

-### Change system prompt on runtime
-
-To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once.
-
-`prompt`: Specify a context that you want all connecting clients to respect.
-
-`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
-
-`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
-
-```json
-{
-    "system_prompt": {
-        "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
-        "anti_prompt": "User:",
-        "assistant_name": "Assistant:"
-    }
-}
-```
-
-**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
-
 ### Interactive mode

 Check the sample in [chat.mjs](chat.mjs).
@@ -1106,12 +1106,7 @@ struct server_context {
         SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str());

         system_prompt = sys_prompt;
-
-        // release all slots
-        for (server_slot & slot : slots) {
-            slot.release();
-        }
-
+        // update system_tokens and KV cache as soon as all slots are idle
         system_need_update = true;
         return true;
     }
@@ -1627,16 +1622,6 @@ struct server_context {
                         break;
                     }

-                    if (task.data.contains("system_prompt")) {
-                        std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
-                        system_prompt_set(sys_prompt);
-
-                        for (server_slot & slot : slots) {
-                            slot.n_past = 0;
-                            slot.n_past_se = 0;
-                        }
-                    }
-
                     slot->reset();

                     slot->id_task = task.id;
@@ -1862,10 +1847,6 @@ struct server_context {
     }

     void update_slots() {
-        if (system_need_update) {
-            system_prompt_update();
-        }
-
         // check if all slots are idle
         {
             bool all_idle = true;
@@ -1878,6 +1859,10 @@ struct server_context {
             }

             if (all_idle) {
+                if (system_need_update) {
+                    system_prompt_update();
+                }
+
                 SRV_INF("%s", "all slots are idle\n");
                 if (system_prompt.empty() && clean_kv_cache) {
                     kv_cache_clear();
@@ -2536,20 +2521,10 @@ int main(int argc, char ** argv) {
     //

     auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
-        // TODO: should we apply API key to all endpoints, including "/health" and "/models"?
-        static const std::unordered_set<std::string> protected_endpoints = {
-            "/props",
-            "/completion",
-            "/completions",
-            "/v1/completions",
-            "/chat/completions",
-            "/v1/chat/completions",
-            "/infill",
-            "/tokenize",
-            "/detokenize",
-            "/embedding",
-            "/embeddings",
-            "/v1/embeddings",
+        static const std::unordered_set<std::string> public_endpoints = {
+            "/health",
+            "/models",
+            "/v1/models",
         };

         // If API key is not set, skip validation
@@ -2557,8 +2532,8 @@ int main(int argc, char ** argv) {
             return true;
         }

-        // If path is not in protected_endpoints list, skip validation
-        if (protected_endpoints.find(req.path) == protected_endpoints.end()) {
+        // If path is public, skip validation
+        if (public_endpoints.find(req.path) != public_endpoints.end()) {
             return true;
         }

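With the middleware switched from a protected-endpoint list to a public-endpoint list, everything except `/health`, `/models` and `/v1/models` requires the api key once one is configured. A small probe sketch, not part of this commit; it assumes a local server started with an api key, and does not pin down the exact rejection status code:

```python
# Sketch only: with an API key configured, only /health, /models and
# /v1/models are expected to answer without credentials; other paths
# should be rejected by the API-key middleware.
import urllib.error
import urllib.request

BASE = "http://localhost:8080"  # assumed local server started with --api-key ...

def status(path: str) -> int:
    try:
        with urllib.request.urlopen(BASE + path) as r:
            return r.status
    except urllib.error.HTTPError as e:
        return e.code

for path in ("/health", "/models", "/v1/models", "/props", "/slots", "/metrics"):
    print(f"{path:12s} -> {status(path)}")
```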
@@ -2620,7 +2595,7 @@ int main(int argc, char ** argv) {

     const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) {
         if (!params.endpoint_slots) {
-            res_error(res, format_error_response("This server does not support slots endpoint. Start it without `--no-slots`", ERROR_TYPE_NOT_SUPPORTED));
+            res_error(res, format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }

@@ -2869,24 +2844,31 @@ int main(int argc, char ** argv) {
     };

     const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
-        std::string template_key = "tokenizer.chat_template", curr_tmpl;
-        int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0);
-        if (tlen > 0) {
-            std::vector<char> curr_tmpl_buf(tlen + 1, 0);
-            if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) {
-                curr_tmpl = std::string(curr_tmpl_buf.data(), tlen);
-            }
-        }
         json data = {
-            { "system_prompt", ctx_server.system_prompt.c_str() },
+            { "system_prompt", ctx_server.system_prompt },
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots", ctx_server.params.n_parallel },
-            { "chat_template", curr_tmpl.c_str() },
+            { "chat_template", llama_get_chat_template(ctx_server.model) },
         };

         res_ok(res, data);
     };

+    const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        if (!ctx_server.params.endpoint_props) {
+            res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
+        json data = json::parse(req.body);
+        if (data.contains("system_prompt")) {
+            std::string system_prompt = data.at("system_prompt");
+            ctx_server.system_prompt_set(system_prompt);
+        }
+
+        res_ok(res, {{ "success", true }});
+    };
+
     const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_cmpl_type cmpl_type, json & data, httplib::Response & res) {
         if (ctx_server.params.embedding || ctx_server.params.reranking) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
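When the server was started without `--props`, `handle_props_change` rejects the request with a "not supported" error. An illustrative check, not part of this commit; it assumes a local server without `--props` and deliberately does not assert a specific HTTP status code:

```python
# Sketch only: POST /props against a server started WITHOUT --props.
# The request should be rejected with a "not supported" error; the exact
# status code is not asserted here.
import json
import urllib.error
import urllib.request

req = urllib.request.Request(
    "http://localhost:8080/props",   # assumed local test server
    data=json.dumps({"system_prompt": "hello"}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
try:
    urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
    print(e.code, e.read().decode())  # error message points at `--props`
```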
@@ -3265,6 +3247,12 @@ int main(int argc, char ** argv) {
         svr->set_base_dir(params.public_path);
     }

+    if (!params.api_keys.empty()) {
+        // for now, if API key is set, web UI is unusable
+        svr->Get("/", [&](const httplib::Request &, httplib::Response & res) {
+            return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8");
+        });
+    } else {
     // using embedded static files
     svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
     svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
@@ -3283,12 +3271,15 @@ int main(int argc, char ** argv) {
     svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
     svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
     svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
+    }

     // register API routes
-    svr->Get ("/health", handle_health);
+    svr->Get ("/health", handle_health); // public endpoint (no API key check)
     svr->Get ("/metrics", handle_metrics);
     svr->Get ("/props", handle_props);
-    svr->Get ("/v1/models", handle_models);
+    svr->Post("/props", handle_props_change);
+    svr->Get ("/models", handle_models); // public endpoint (no API key check)
+    svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
     svr->Post("/completion", handle_completions); // legacy
     svr->Post("/completions", handle_completions);
     svr->Post("/v1/completions", handle_completions);
@@ -5,7 +5,7 @@ Feature: Security
   Background: Server startup with an api key defined
     Given a server listening on localhost:8080
     And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And a server api key llama.cpp
+    And a server api key THIS_IS_THE_KEY
     Then the server is starting
     Then the server is healthy

@@ -17,8 +17,8 @@ Feature: Security

     Examples: Prompts
       | api_key | api_error |
-      | llama.cpp | no |
-      | llama.cpp | no |
+      | THIS_IS_THE_KEY | no |
+      | THIS_IS_THE_KEY | no |
       | hackeme | raised |
       | | raised |

@@ -33,8 +33,8 @@ Feature: Security

     Examples: Prompts
       | api_key | api_error |
-      | llama.cpp | no |
-      | llama.cpp | no |
+      | THIS_IS_THE_KEY | no |
+      | THIS_IS_THE_KEY | no |
       | hackme | raised |

   Scenario Outline: OAI Compatibility (invalid response formats)
@@ -55,7 +55,7 @@ Feature: Security


   Scenario Outline: CORS Options
-    Given a user api key llama.cpp
+    Given a user api key THIS_IS_THE_KEY
     When an OPTIONS request is sent from <origin>
     Then CORS header <cors_header> is set to <cors_header_value>

@@ -1299,7 +1299,8 @@ async def wait_for_slots_status(context,

    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
        while True:
-            async with await session.get(f'{base_url}/slots', params=params) as slots_response:
+            headers = {'Authorization': f'Bearer {context.server_api_key}'}
+            async with await session.get(f'{base_url}/slots', params=params, headers=headers) as slots_response:
                status_code = slots_response.status
                slots = await slots_response.json()
                if context.debug:
@@ -1387,6 +1388,7 @@ def start_server_background(context):
        context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
    server_listen_addr = context.server_fqdn
    server_args = [
+        '--slots',  # requires to get slot status via /slots endpoint
        '--host', server_listen_addr,
        '--port', context.server_port,
    ]
@@ -90,6 +90,19 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
     return formatted_chat;
 }

+static std::string llama_get_chat_template(const struct llama_model * model) {
+    std::string template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
+    if (res < 0) {
+        return "";
+    } else {
+        std::vector<char> model_template(res, 0);
+        llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size());
+    }
+}
+
 //
 // base64 utils (TODO: move to common in the future)
 //
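The template string returned by the new `llama_get_chat_template()` helper is what `GET /props` reports as `chat_template`. A small client-side sketch, not part of this commit, to read it back; the server address and api key are placeholders:

```python
# Sketch only: read the model's chat template back through GET /props.
# Server address and API key are placeholders.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8080/props",
    headers={"Authorization": "Bearer THIS_IS_THE_KEY"},  # include the key if one is configured
)
with urllib.request.urlopen(req) as r:
    tmpl = json.load(r).get("chat_template", "")
print(tmpl[:200] or "<model has no chat template>")
```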
@@ -2311,7 +2311,7 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
 0x003000,
 };

-// list is always in ascending order, to enable binary searh
+// list is always in ascending order, to enable binary search
 const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
 {0x000041, 0x000061},
 {0x000042, 0x000062},
@@ -3748,7 +3748,7 @@ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase
 {0x01E921, 0x01E943},
 };

-// list is always in ascending order, to enable binary searh
+// list is always in ascending order, to enable binary search
 const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
 {0x000061, 0x000041},
 {0x000062, 0x000042},