From 8843a98c2ba97a25e93319a104f9ddfaf83ce4c4 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Tue, 30 Apr 2024 00:52:50 +0100 Subject: [PATCH] Improve usability of --model-url & related flags (#6930) * args: default --model to models/ + filename from --model-url or --hf-file (or else legacy models/7B/ggml-model-f16.gguf) * args: main & server now call gpt_params_handle_model_default * args: define DEFAULT_MODEL_PATH + update cli docs * curl: check url of previous download (.json metadata w/ url, etag & lastModified) * args: fix update to quantize-stats.cpp * curl: support legacy .etag / .lastModified companion files * curl: rm legacy .etag file support * curl: reuse regex across headers callback calls * curl: unique_ptr to manage lifecycle of curl & outfile * curl: nit: no need for multiline regex flag * curl: update failed test (model file collision) + gitignore *.gguf.json --- .gitignore | 1 + common/common.cpp | 258 +++++++++--------- common/common.h | 6 +- examples/main/README.md | 2 +- examples/quantize-stats/quantize-stats.cpp | 2 +- examples/server/server.cpp | 4 +- .../server/tests/features/embeddings.feature | 2 +- 7 files changed, 143 insertions(+), 132 deletions(-) diff --git a/.gitignore b/.gitignore index 60f9d1f8d..50ae0973a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.a *.so *.gguf +*.gguf.json *.bin *.exe *.dll diff --git a/common/common.cpp b/common/common.cpp index fe84039f7..099d0356f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -67,7 +67,6 @@ #include #endif #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 -#define LLAMA_CURL_MAX_HEADER_LENGTH 256 #endif // LLAMA_USE_CURL using json = nlohmann::ordered_json; @@ -1324,6 +1323,29 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return false; } +void gpt_params_handle_model_default(gpt_params & params) { + if (!params.hf_repo.empty()) { + // short-hand to avoid specifying --hf-file -> default it to --model + if (params.hf_file.empty()) { + if (params.model.empty()) { + throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); + } + params.hf_file = params.model; + } else if (params.model.empty()) { + params.model = "models/" + string_split(params.hf_file, '/').back(); + } + } else if (!params.model_url.empty()) { + if (params.model.empty()) { + auto f = string_split(params.model_url, '#').front(); + f = string_split(f, '?').front(); + f = string_split(f, '/').back(); + params.model = "models/" + f; + } + } else if (params.model.empty()) { + params.model = DEFAULT_MODEL_PATH; + } +} + bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; @@ -1352,10 +1374,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } - // short-hand to avoid specifying --hf-file -> default it to --model - if (!params.hf_repo.empty() && params.hf_file.empty()) { - params.hf_file = params.model; - } + gpt_params_handle_model_default(params); if (params.escape) { process_escapes(params.prompt); @@ -1548,7 +1567,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --control-vector-layer-range START END\n"); printf(" layer range to apply the control vector(s) to, start and end inclusive\n"); printf(" -m FNAME, --model FNAME\n"); - printf(" model path (default: %s)\n", params.model.c_str()); + printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH); printf(" -md FNAME, --model-draft FNAME\n"); printf(" draft model for speculative decoding (default: unused)\n"); printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); @@ -1896,59 +1915,75 @@ void llama_batch_add( #ifdef LLAMA_USE_CURL -static bool llama_download_file(CURL * curl, const char * url, const char * path) { +static bool starts_with(const std::string & str, const std::string & prefix) { + // While we wait for C++20's std::string::starts_with... + return str.rfind(prefix, 0) == 0; +} + +static bool llama_download_file(const std::string & url, const std::string & path) { + + // Initialize libcurl + std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); + if (!curl) { + fprintf(stderr, "%s: error initializing libcurl\n", __func__); + return false; + } + bool force_download = false; // Set the URL, allow to follow http redirection - curl_easy_setopt(curl, CURLOPT_URL, url); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); #if defined(_WIN32) // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of // operating system. Currently implemented under MS-Windows. - curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); + curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); #endif // Check if the file already exists locally struct stat model_file_info; - auto file_exists = (stat(path, &model_file_info) == 0); + auto file_exists = (stat(path.c_str(), &model_file_info) == 0); - // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files - char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; - char etag_path[PATH_MAX] = {0}; - snprintf(etag_path, sizeof(etag_path), "%s.etag", path); - - char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; - char last_modified_path[PATH_MAX] = {0}; - snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path); + // If the file exists, check its JSON metadata companion file. + std::string metadata_path = path + ".json"; + nlohmann::json metadata; + std::string etag; + std::string last_modified; if (file_exists) { - auto * f_etag = fopen(etag_path, "r"); - if (f_etag) { - if (!fgets(etag, sizeof(etag), f_etag)) { - fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path); - } else { - fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag); + // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block). + std::ifstream metadata_in(metadata_path); + if (metadata_in.good()) { + try { + metadata_in >> metadata; + fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); + if (metadata.contains("url") && metadata["url"].is_string()) { + auto previous_url = metadata["url"].get(); + if (previous_url != url) { + fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); + return false; + } + } + if (metadata.contains("etag") && metadata["etag"].is_string()) { + etag = metadata["etag"]; + } + if (metadata.contains("lastModified") && metadata["lastModified"].is_string()) { + last_modified = metadata["lastModified"]; + } + } catch (const nlohmann::json::exception & e) { + fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); + return false; } - fclose(f_etag); - } - - auto * f_last_modified = fopen(last_modified_path, "r"); - if (f_last_modified) { - if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) { - fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path); - } else { - fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path, - last_modified); - } - fclose(f_last_modified); } + } else { + fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str()); } // Send a HEAD request to retrieve the etag and last-modified headers struct llama_load_model_from_url_headers { - char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; - char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + std::string etag; + std::string last_modified; }; llama_load_model_from_url_headers headers; { @@ -1956,38 +1991,37 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata; - // Convert header field name to lowercase - for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) { - buffer[i] = tolower(buffer[i]); - } + static std::regex header_regex("([^:]+): (.*)\r\n"); + static std::regex etag_regex("ETag", std::regex_constants::icase); + static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase); - const char * etag_prefix = "etag: "; - if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) { - strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF - } - - const char * last_modified_prefix = "last-modified: "; - if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) { - strncpy(headers->last_modified, buffer + strlen(last_modified_prefix), - n_items - strlen(last_modified_prefix) - 2); // Remove CRLF + std::string header(buffer, n_items); + std::smatch match; + if (std::regex_match(header, match, header_regex)) { + const std::string & key = match[1]; + const std::string & value = match[2]; + if (std::regex_match(key, match, etag_regex)) { + headers->etag = value; + } else if (std::regex_match(key, match, last_modified_regex)) { + headers->last_modified = value; + } } return n_items; }; - curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb - curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress - curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast(header_callback)); - curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers); + curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress + curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast(header_callback)); + curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); - CURLcode res = curl_easy_perform(curl); + CURLcode res = curl_easy_perform(curl.get()); if (res != CURLE_OK) { - curl_easy_cleanup(curl); fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); return false; } long http_code = 0; - curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); + curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code); if (http_code != 200) { // HEAD not supported, we don't know if the file has changed // force trigger downloading @@ -1996,28 +2030,30 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path } } - // If the ETag or the Last-Modified headers are different: trigger a new download - bool should_download = !file_exists - || force_download - || (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0) - || (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0); + bool should_download = !file_exists || force_download; + if (!should_download) { + if (!etag.empty() && etag != headers.etag) { + fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); + should_download = true; + } else if (!last_modified.empty() && last_modified != headers.last_modified) { + fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); + should_download = true; + } + } if (should_download) { - char path_temporary[PATH_MAX] = {0}; - snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path); + std::string path_temporary = path + ".downloadInProgress"; if (file_exists) { - fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path); - if (remove(path) != 0) { - curl_easy_cleanup(curl); - fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path); + fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); + if (remove(path.c_str()) != 0) { + fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str()); return false; } } // Set the output file - auto * outfile = fopen(path_temporary, "wb"); + std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb"), fclose); if (!outfile) { - curl_easy_cleanup(curl); - fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path); + fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str()); return false; } @@ -2025,12 +2061,12 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t { return fwrite(data, size, nmemb, (FILE *)fd); }; - curl_easy_setopt(curl, CURLOPT_NOBODY, 0L); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast(write_callback)); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile); + curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L); + curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); + curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get()); // display download progress - curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L); // helper function to hide password in URL auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { @@ -2049,51 +2085,34 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path // start the download fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, - llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified); - auto res = curl_easy_perform(curl); + llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); + auto res = curl_easy_perform(curl.get()); if (res != CURLE_OK) { - fclose(outfile); - curl_easy_cleanup(curl); fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); return false; } long http_code = 0; - curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code); + curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code); if (http_code < 200 || http_code >= 400) { - fclose(outfile); - curl_easy_cleanup(curl); fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code); return false; } - // Clean up - fclose(outfile); + // Causes file to be closed explicitly here before we rename it. + outfile.reset(); - // Write the new ETag to the .etag file - if (strlen(headers.etag) > 0) { - auto * etag_file = fopen(etag_path, "w"); - if (etag_file) { - fputs(headers.etag, etag_file); - fclose(etag_file); - fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag); - } - } + // Write the updated JSON metadata file. + metadata.update({ + {"url", url}, + {"etag", headers.etag}, + {"lastModified", headers.last_modified} + }); + std::ofstream(metadata_path) << metadata.dump(4); + fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); - // Write the new lastModified to the .etag file - if (strlen(headers.last_modified) > 0) { - auto * last_modified_file = fopen(last_modified_path, "w"); - if (last_modified_file) { - fputs(headers.last_modified, last_modified_file); - fclose(last_modified_file); - fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path, - headers.last_modified); - } - } - - if (rename(path_temporary, path) != 0) { - curl_easy_cleanup(curl); - fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path); + if (rename(path_temporary.c_str(), path.c_str()) != 0) { + fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); return false; } } @@ -2111,15 +2130,7 @@ struct llama_model * llama_load_model_from_url( return NULL; } - // Initialize libcurl - auto * curl = curl_easy_init(); - - if (!curl) { - fprintf(stderr, "%s: error initializing libcurl\n", __func__); - return NULL; - } - - if (!llama_download_file(curl, model_url, path_model)) { + if (!llama_download_file(model_url, path_model)) { return NULL; } @@ -2133,7 +2144,6 @@ struct llama_model * llama_load_model_from_url( auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params); if (!ctx_gguf) { fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model); - curl_easy_cleanup(curl); return NULL; } @@ -2145,8 +2155,6 @@ struct llama_model * llama_load_model_from_url( gguf_free(ctx_gguf); } - curl_easy_cleanup(curl); - if (n_split > 1) { char split_prefix[PATH_MAX] = {0}; char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0}; @@ -2177,11 +2185,7 @@ struct llama_model * llama_load_model_from_url( char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); - auto * curl = curl_easy_init(); - bool res = llama_download_file(curl, split_url, split_path); - curl_easy_cleanup(curl); - - return res; + return llama_download_file(split_url, split_path); }, idx)); } @@ -2668,7 +2672,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); - fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str()); + fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH); fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); diff --git a/common/common.h b/common/common.h index 3233d90e6..8afdf2bdf 100644 --- a/common/common.h +++ b/common/common.h @@ -31,6 +31,8 @@ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ } while(0) +#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" + // build info extern int LLAMA_BUILD_NUMBER; extern char const *LLAMA_COMMIT; @@ -92,7 +94,7 @@ struct gpt_params { // // sampling parameters struct llama_sampling_params sparams; - std::string model = "models/7B/ggml-model-f16.gguf"; // model path + std::string model = ""; // model path std::string model_draft = ""; // draft model for speculative decoding std::string model_alias = "unknown"; // model alias std::string model_url = ""; // model url to download @@ -171,6 +173,8 @@ struct gpt_params { std::vector image; // path to image file(s) }; +void gpt_params_handle_model_default(gpt_params & params); + bool parse_kv_override(const char * data, std::vector & overrides); bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); diff --git a/examples/main/README.md b/examples/main/README.md index 649f4e0f3..e7a38743c 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -66,7 +66,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt In this section, we cover the most commonly used options for running the `main` program with the LLaMA models: -- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set). - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models. diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 1d05f1391..746df8446 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -23,7 +23,7 @@ #endif struct quantize_stats_params { - std::string model = "models/7B/ggml-model-f16.gguf"; + std::string model = DEFAULT_MODEL_PATH; bool verbose = false; bool per_layer_stats = false; bool print_histogram = false; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2760aea8f..01453af2c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2353,7 +2353,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" disable KV offload\n"); } printf(" -m FNAME, --model FNAME\n"); - printf(" model path (default: %s)\n", params.model.c_str()); + printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH); printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); printf(" model download url (default: unused)\n"); printf(" -hfr REPO, --hf-repo REPO\n"); @@ -2835,6 +2835,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, } } + gpt_params_handle_model_default(params); + if (!params.kv_overrides.empty()) { params.kv_overrides.emplace_back(); params.kv_overrides.back().key[0] = 0; diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature index dcf1434f9..6f163ce04 100644 --- a/examples/server/tests/features/embeddings.feature +++ b/examples/server/tests/features/embeddings.feature @@ -5,7 +5,7 @@ Feature: llama.cpp server Background: Server startup Given a server listening on localhost:8080 And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf - And a model file ggml-model-f16.gguf + And a model file bert-bge-small.gguf And a model alias bert-bge-small And 42 as server seed And 2 slots