mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 05:17:21 +01:00
common: llama_load_model_from_url split support (#6192)
* llama: llama_split_prefix fix strncpy does not include string termination
  common: llama_load_model_from_url:
   - fix header name case sensitive
   - support downloading additional split in parallel
   - hide password in url
* common: EOL EOF
* common: remove redundant LLAMA_CURL_MAX_PATH_LENGTH definition
* common: change max url max length
* common: minor comment
* server: support HF URL options
* llama: llama_model_loader fix log
* common: use a constant for max url length
* common: clean up curl if file cannot be loaded in gguf
* server: tests: add split tests, and HF options params
* common: move llama_download_hide_password_in_url inside llama_download_file as a lambda
* server: tests: enable back Release test on PR
* spacing
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* spacing
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* spacing
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
1997577d5e
commit
f482bb2e49
1
.github/workflows/server.yml
vendored
1
.github/workflows/server.yml
vendored
@ -35,7 +35,6 @@ jobs:
|
|||||||
include:
|
include:
|
||||||
- build_type: Release
|
- build_type: Release
|
||||||
sanitizer: ""
|
sanitizer: ""
|
||||||
disabled_on_pr: true
|
|
||||||
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
|
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
|
||||||
|
|
||||||
container:
|
container:
|
||||||
|
@ -39,6 +39,9 @@
|
|||||||
#endif
|
#endif
|
||||||
#if defined(LLAMA_USE_CURL)
|
#if defined(LLAMA_USE_CURL)
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
|
#include <curl/easy.h>
|
||||||
|
#include <thread>
|
||||||
|
#include <future>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
@ -61,7 +64,7 @@
|
|||||||
#else
|
#else
|
||||||
#include <sys/syslimits.h>
|
#include <sys/syslimits.h>
|
||||||
#endif
|
#endif
|
||||||
#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX
|
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
||||||
#define LLAMA_CURL_MAX_HEADER_LENGTH 256
|
#define LLAMA_CURL_MAX_HEADER_LENGTH 256
|
||||||
#endif // LLAMA_USE_CURL
|
#endif // LLAMA_USE_CURL
|
||||||
|
|
||||||
@ -1702,27 +1705,13 @@ void llama_batch_add(
|
|||||||
|
|
||||||
#ifdef LLAMA_USE_CURL
|
#ifdef LLAMA_USE_CURL
|
||||||
|
|
||||||
struct llama_model * llama_load_model_from_url(
|
static bool llama_download_file(CURL * curl, const char * url, const char * path) {
|
||||||
const char * model_url,
|
bool force_download = false;
|
||||||
const char * path_model,
|
|
||||||
const struct llama_model_params & params) {
|
|
||||||
// Basic validation of the model_url
|
|
||||||
if (!model_url || strlen(model_url) == 0) {
|
|
||||||
fprintf(stderr, "%s: invalid model_url\n", __func__);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Initialize libcurl globally
|
|
||||||
auto curl = curl_easy_init();
|
|
||||||
|
|
||||||
if (!curl) {
|
|
||||||
fprintf(stderr, "%s: error initializing libcurl\n", __func__);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set the URL, allow to follow http redirection
|
// Set the URL, allow to follow http redirection
|
||||||
curl_easy_setopt(curl, CURLOPT_URL, model_url);
|
curl_easy_setopt(curl, CURLOPT_URL, url);
|
||||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||||
|
|
||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
|
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
|
||||||
// operating system. Currently implemented under MS-Windows.
|
// operating system. Currently implemented under MS-Windows.
|
||||||
@ -1731,16 +1720,16 @@ struct llama_model * llama_load_model_from_url(
|
|||||||
|
|
||||||
// Check if the file already exists locally
|
// Check if the file already exists locally
|
||||||
struct stat model_file_info;
|
struct stat model_file_info;
|
||||||
auto file_exists = (stat(path_model, &model_file_info) == 0);
|
auto file_exists = (stat(path, &model_file_info) == 0);
|
||||||
|
|
||||||
// If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
|
// If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
|
||||||
char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
|
char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
|
||||||
char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
|
char etag_path[PATH_MAX] = {0};
|
||||||
snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model);
|
snprintf(etag_path, sizeof(etag_path), "%s.etag", path);
|
||||||
|
|
||||||
char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
|
char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
|
||||||
char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
|
char last_modified_path[PATH_MAX] = {0};
|
||||||
snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model);
|
snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path);
|
||||||
|
|
||||||
if (file_exists) {
|
if (file_exists) {
|
||||||
auto * f_etag = fopen(etag_path, "r");
|
auto * f_etag = fopen(etag_path, "r");
|
||||||
@ -1748,7 +1737,7 @@ struct llama_model * llama_load_model_from_url(
|
|||||||
if (!fgets(etag, sizeof(etag), f_etag)) {
|
if (!fgets(etag, sizeof(etag), f_etag)) {
|
||||||
fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
|
fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag);
|
fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag);
|
||||||
}
|
}
|
||||||
fclose(f_etag);
|
fclose(f_etag);
|
||||||
}
|
}
|
||||||
@ -1758,7 +1747,7 @@ struct llama_model * llama_load_model_from_url(
|
|||||||
if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
|
if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
|
||||||
fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
|
fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path,
|
fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
|
||||||
last_modified);
|
last_modified);
|
||||||
}
|
}
|
||||||
fclose(f_last_modified);
|
fclose(f_last_modified);
|
||||||
@ -1776,6 +1765,11 @@ struct llama_model * llama_load_model_from_url(
|
|||||||
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
||||||
llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
|
llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
|
||||||
|
|
||||||
|
// Convert header field name to lowercase
|
||||||
|
for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) {
|
||||||
|
buffer[i] = tolower(buffer[i]);
|
||||||
|
}
|
||||||
|
|
||||||
const char * etag_prefix = "etag: ";
|
const char * etag_prefix = "etag: ";
|
||||||
if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
|
if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
|
||||||
strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
|
strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
|
||||||
@ -1798,7 +1792,7 @@ struct llama_model * llama_load_model_from_url(
|
|||||||
if (res != CURLE_OK) {
|
if (res != CURLE_OK) {
|
||||||
curl_easy_cleanup(curl);
|
curl_easy_cleanup(curl);
|
||||||
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
|
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
|
||||||
return NULL;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
long http_code = 0;
|
long http_code = 0;
|
||||||
@ -1806,30 +1800,34 @@ struct llama_model * llama_load_model_from_url(
|
|||||||
if (http_code != 200) {
|
if (http_code != 200) {
|
||||||
// HEAD not supported, we don't know if the file has changed
|
// HEAD not supported, we don't know if the file has changed
|
||||||
// force trigger downloading
|
// force trigger downloading
|
||||||
file_exists = false;
|
force_download = true;
|
||||||
fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the ETag or the Last-Modified headers are different: trigger a new download
|
// If the ETag or the Last-Modified headers are different: trigger a new download
|
||||||
if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) {
|
bool should_download = !file_exists
|
||||||
char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
|
|| force_download
|
||||||
snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model);
|
|| (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0)
|
||||||
|
|| (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0);
|
||||||
|
if (should_download) {
|
||||||
|
char path_temporary[PATH_MAX] = {0};
|
||||||
|
snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
|
||||||
if (file_exists) {
|
if (file_exists) {
|
||||||
fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model);
|
fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path);
|
||||||
if (remove(path_model) != 0) {
|
if (remove(path) != 0) {
|
||||||
curl_easy_cleanup(curl);
|
curl_easy_cleanup(curl);
|
||||||
fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path_model);
|
fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path);
|
||||||
return NULL;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set the output file
|
// Set the output file
|
||||||
auto * outfile = fopen(path_model_temporary, "wb");
|
auto * outfile = fopen(path_temporary, "wb");
|
||||||
if (!outfile) {
|
if (!outfile) {
|
||||||
curl_easy_cleanup(curl);
|
curl_easy_cleanup(curl);
|
||||||
fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model);
|
fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path);
|
||||||
return NULL;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
|
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
|
||||||
@ -1843,15 +1841,30 @@ struct llama_model * llama_load_model_from_url(
|
|||||||
// display download progress
|
// display download progress
|
||||||
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
|
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
|
||||||
|
|
||||||
|
// helper function to hide password in URL
|
||||||
|
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
|
||||||
|
std::size_t protocol_pos = url.find("://");
|
||||||
|
if (protocol_pos == std::string::npos) {
|
||||||
|
return url; // Malformed URL
|
||||||
|
}
|
||||||
|
|
||||||
|
std::size_t at_pos = url.find('@', protocol_pos + 3);
|
||||||
|
if (at_pos == std::string::npos) {
|
||||||
|
return url; // No password in URL
|
||||||
|
}
|
||||||
|
|
||||||
|
return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
|
||||||
|
};
|
||||||
|
|
||||||
// start the download
|
// start the download
|
||||||
fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
||||||
model_url, path_model, headers.etag, headers.last_modified);
|
llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified);
|
||||||
auto res = curl_easy_perform(curl);
|
auto res = curl_easy_perform(curl);
|
||||||
if (res != CURLE_OK) {
|
if (res != CURLE_OK) {
|
||||||
fclose(outfile);
|
fclose(outfile);
|
||||||
curl_easy_cleanup(curl);
|
curl_easy_cleanup(curl);
|
||||||
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
|
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
|
||||||
return NULL;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
long http_code = 0;
|
long http_code = 0;
|
||||||
@ -1860,7 +1873,7 @@ struct llama_model * llama_load_model_from_url(
|
|||||||
fclose(outfile);
|
fclose(outfile);
|
||||||
curl_easy_cleanup(curl);
|
curl_easy_cleanup(curl);
|
||||||
fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
|
fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
|
||||||
return NULL;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clean up
|
// Clean up
|
||||||
@ -1872,7 +1885,7 @@ struct llama_model * llama_load_model_from_url(
|
|||||||
if (etag_file) {
|
if (etag_file) {
|
||||||
fputs(headers.etag, etag_file);
|
fputs(headers.etag, etag_file);
|
||||||
fclose(etag_file);
|
fclose(etag_file);
|
||||||
fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag);
|
fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1882,20 +1895,118 @@ struct llama_model * llama_load_model_from_url(
|
|||||||
if (last_modified_file) {
|
if (last_modified_file) {
|
||||||
fputs(headers.last_modified, last_modified_file);
|
fputs(headers.last_modified, last_modified_file);
|
||||||
fclose(last_modified_file);
|
fclose(last_modified_file);
|
||||||
fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path,
|
fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
|
||||||
headers.last_modified);
|
headers.last_modified);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rename(path_model_temporary, path_model) != 0) {
|
if (rename(path_temporary, path) != 0) {
|
||||||
curl_easy_cleanup(curl);
|
curl_easy_cleanup(curl);
|
||||||
fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model);
|
fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct llama_model * llama_load_model_from_url(
|
||||||
|
const char * model_url,
|
||||||
|
const char * path_model,
|
||||||
|
const struct llama_model_params & params) {
|
||||||
|
// Basic validation of the model_url
|
||||||
|
if (!model_url || strlen(model_url) == 0) {
|
||||||
|
fprintf(stderr, "%s: invalid model_url\n", __func__);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize libcurl
|
||||||
|
auto * curl = curl_easy_init();
|
||||||
|
|
||||||
|
if (!curl) {
|
||||||
|
fprintf(stderr, "%s: error initializing libcurl\n", __func__);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!curl) {
|
||||||
|
fprintf(stderr, "%s: error initializing libcurl\n", __func__);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!llama_download_file(curl, model_url, path_model)) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// check for additional GGUFs split to download
|
||||||
|
int n_split = 0;
|
||||||
|
{
|
||||||
|
struct gguf_init_params gguf_params = {
|
||||||
|
/*.no_alloc = */ true,
|
||||||
|
/*.ctx = */ NULL,
|
||||||
|
};
|
||||||
|
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
|
||||||
|
if (!ctx_gguf) {
|
||||||
|
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model);
|
||||||
|
curl_easy_cleanup(curl);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
|
||||||
|
if (key_n_split >= 0) {
|
||||||
|
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
|
||||||
|
}
|
||||||
|
|
||||||
|
gguf_free(ctx_gguf);
|
||||||
|
}
|
||||||
|
|
||||||
|
curl_easy_cleanup(curl);
|
||||||
|
|
||||||
|
if (n_split > 1) {
|
||||||
|
char split_prefix[PATH_MAX] = {0};
|
||||||
|
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
||||||
|
|
||||||
|
// Verify the first split file format
|
||||||
|
// and extract split URL and PATH prefixes
|
||||||
|
{
|
||||||
|
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
|
||||||
|
fprintf(stderr, "\n%s: unexpected model file name: %s"
|
||||||
|
" n_split=%d\n", __func__, path_model, n_split);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
|
||||||
|
fprintf(stderr, "\n%s: unexpected model url: %s"
|
||||||
|
" n_split=%d\n", __func__, model_url, n_split);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Prepare download in parallel
|
||||||
|
std::vector<std::future<bool>> futures_download;
|
||||||
|
for (int idx = 1; idx < n_split; idx++) {
|
||||||
|
futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
|
||||||
|
char split_path[PATH_MAX] = {0};
|
||||||
|
llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
|
||||||
|
|
||||||
|
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
||||||
|
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
|
||||||
|
|
||||||
|
auto * curl = curl_easy_init();
|
||||||
|
bool res = llama_download_file(curl, split_url, split_path);
|
||||||
curl_easy_cleanup(curl);
|
curl_easy_cleanup(curl);
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}, idx));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for all downloads to complete
|
||||||
|
for (auto & f : futures_download) {
|
||||||
|
if (!f.get()) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return llama_load_model_from_file(path_model, params);
|
return llama_load_model_from_file(path_model, params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -306,3 +306,10 @@ struct llama_control_vector_load_info {
|
|||||||
// Load control vectors, scale each by strength, and add them together.
|
// Load control vectors, scale each by strength, and add them together.
|
||||||
// On error, returns {-1, empty}
|
// On error, returns {-1, empty}
|
||||||
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
|
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Split utils
|
||||||
|
//
|
||||||
|
static const char * const LLM_KV_SPLIT_NO = "split.no";
|
||||||
|
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
|
||||||
|
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||||
|
@ -26,10 +26,6 @@ enum split_operation : uint8_t {
|
|||||||
SPLIT_OP_MERGE,
|
SPLIT_OP_MERGE,
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char * const LLM_KV_SPLIT_NO = "split.no";
|
|
||||||
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
|
|
||||||
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
|
||||||
|
|
||||||
struct split_params {
|
struct split_params {
|
||||||
split_operation operation = SPLIT_OP_SPLIT;
|
split_operation operation = SPLIT_OP_SPLIT;
|
||||||
int n_split_tensors = 128;
|
int n_split_tensors = 128;
|
||||||
|
@ -20,7 +20,9 @@ The project is under active development, and we are [looking for feedback and co
|
|||||||
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
|
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
|
||||||
- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
|
- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
|
||||||
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
|
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
|
||||||
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
|
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (default: unused).
|
||||||
|
- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository (default: unused).
|
||||||
|
- `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
|
||||||
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
|
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
|
||||||
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
|
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
|
||||||
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||||
|
@ -2208,7 +2208,11 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
|
|||||||
printf(" -m FNAME, --model FNAME\n");
|
printf(" -m FNAME, --model FNAME\n");
|
||||||
printf(" model path (default: %s)\n", params.model.c_str());
|
printf(" model path (default: %s)\n", params.model.c_str());
|
||||||
printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
|
printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
|
||||||
printf(" model download url (default: %s)\n", params.model_url.c_str());
|
printf(" model download url (default: unused)\n");
|
||||||
|
printf(" -hfr REPO, --hf-repo REPO\n");
|
||||||
|
printf(" Hugging Face model repository (default: unused)\n");
|
||||||
|
printf(" -hff FILE, --hf-file FILE\n");
|
||||||
|
printf(" Hugging Face model file (default: unused)\n");
|
||||||
printf(" -a ALIAS, --alias ALIAS\n");
|
printf(" -a ALIAS, --alias ALIAS\n");
|
||||||
printf(" set an alias for the model, will be added as `model` field in completion response\n");
|
printf(" set an alias for the model, will be added as `model` field in completion response\n");
|
||||||
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||||
@ -2337,6 +2341,18 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.model_url = argv[i];
|
params.model_url = argv[i];
|
||||||
|
} else if (arg == "-hfr" || arg == "--hf-repo") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.hf_repo = argv[i];
|
||||||
|
} else if (arg == "-hff" || arg == "--hf-file") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.hf_file = argv[i];
|
||||||
} else if (arg == "-a" || arg == "--alias") {
|
} else if (arg == "-a" || arg == "--alias") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -4,7 +4,8 @@ Feature: Parallel
|
|||||||
|
|
||||||
Background: Server startup
|
Background: Server startup
|
||||||
Given a server listening on localhost:8080
|
Given a server listening on localhost:8080
|
||||||
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
|
And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
|
||||||
|
And a model file test-model-00001-of-00003.gguf
|
||||||
And 42 as server seed
|
And 42 as server seed
|
||||||
And 128 as batch size
|
And 128 as batch size
|
||||||
And 256 KV cache size
|
And 256 KV cache size
|
||||||
|
@ -4,8 +4,8 @@ Feature: llama.cpp server
|
|||||||
|
|
||||||
Background: Server startup
|
Background: Server startup
|
||||||
Given a server listening on localhost:8080
|
Given a server listening on localhost:8080
|
||||||
And a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf
|
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
|
||||||
And a model file stories260K.gguf
|
And a model file test-model.gguf
|
||||||
And a model alias tinyllama-2
|
And a model alias tinyllama-2
|
||||||
And 42 as server seed
|
And 42 as server seed
|
||||||
# KV Cache corresponds to the total amount of tokens
|
# KV Cache corresponds to the total amount of tokens
|
||||||
|
@ -16,7 +16,6 @@ import numpy as np
|
|||||||
import openai
|
import openai
|
||||||
from behave import step
|
from behave import step
|
||||||
from behave.api.async_step import async_run_until_complete
|
from behave.api.async_step import async_run_until_complete
|
||||||
from huggingface_hub import hf_hub_download
|
|
||||||
from prometheus_client import parser
|
from prometheus_client import parser
|
||||||
|
|
||||||
|
|
||||||
@ -39,6 +38,8 @@ def step_server_config(context, server_fqdn, server_port):
|
|||||||
|
|
||||||
context.model_alias = None
|
context.model_alias = None
|
||||||
context.model_file = None
|
context.model_file = None
|
||||||
|
context.model_hf_repo = None
|
||||||
|
context.model_hf_file = None
|
||||||
context.model_url = None
|
context.model_url = None
|
||||||
context.n_batch = None
|
context.n_batch = None
|
||||||
context.n_ubatch = None
|
context.n_ubatch = None
|
||||||
@ -68,9 +69,9 @@ def step_server_config(context, server_fqdn, server_port):
|
|||||||
|
|
||||||
@step('a model file {hf_file} from HF repo {hf_repo}')
|
@step('a model file {hf_file} from HF repo {hf_repo}')
|
||||||
def step_download_hf_model(context, hf_file, hf_repo):
|
def step_download_hf_model(context, hf_file, hf_repo):
|
||||||
context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file)
|
context.model_hf_repo = hf_repo
|
||||||
if context.debug:
|
context.model_hf_file = hf_file
|
||||||
print(f"model file: {context.model_file}")
|
context.model_file = os.path.basename(hf_file)
|
||||||
|
|
||||||
|
|
||||||
@step('a model file {model_file}')
|
@step('a model file {model_file}')
|
||||||
@ -1079,6 +1080,10 @@ def start_server_background(context):
|
|||||||
server_args.extend(['--model', context.model_file])
|
server_args.extend(['--model', context.model_file])
|
||||||
if context.model_url:
|
if context.model_url:
|
||||||
server_args.extend(['--model-url', context.model_url])
|
server_args.extend(['--model-url', context.model_url])
|
||||||
|
if context.model_hf_repo:
|
||||||
|
server_args.extend(['--hf-repo', context.model_hf_repo])
|
||||||
|
if context.model_hf_file:
|
||||||
|
server_args.extend(['--hf-file', context.model_hf_file])
|
||||||
if context.n_batch:
|
if context.n_batch:
|
||||||
server_args.extend(['--batch-size', context.n_batch])
|
server_args.extend(['--batch-size', context.n_batch])
|
||||||
if context.n_ubatch:
|
if context.n_ubatch:
|
||||||
|
@ -2959,7 +2959,7 @@ struct llama_model_loader {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split);
|
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
n_kv = gguf_get_n_kv(meta);
|
n_kv = gguf_get_n_kv(meta);
|
||||||
@ -15140,7 +15140,7 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
|
|||||||
// check if dest ends with postfix
|
// check if dest ends with postfix
|
||||||
int size_prefix = str_split_path.size() - str_postfix.size();
|
int size_prefix = str_split_path.size() - str_postfix.size();
|
||||||
if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
|
if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
|
||||||
snprintf(dest, std::min((size_t) size_prefix, maxlen), "%s", split_path);
|
snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
|
||||||
return size_prefix;
|
return size_prefix;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user