common: llama_load_model_from_url using --model-url (#6098)

* common: llama_load_model_from_url with libcurl dependency

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

commit d01b3c4c32 (parent cd776c37c9)
.github/workflows/build.yml (22 lines changed)

@@ -48,6 +48,28 @@ jobs:
           CC=gcc-8 make tests -j $(nproc)
           make test -j $(nproc)
 
+  ubuntu-focal-make-curl:
+    runs-on: ubuntu-20.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev
+
+      - name: Build
+        id: make_build
+        env:
+          LLAMA_FATAL_WARNINGS: 1
+          LLAMA_CURL: 1
+        run: |
+          CC=gcc-8 make -j $(nproc)
+
   ubuntu-latest-cmake:
     runs-on: ubuntu-latest
.github/workflows/server.yml (20 lines changed)

@@ -57,7 +57,8 @@ jobs:
             cmake \
             python3-pip \
             wget \
-            language-pack-en
+            language-pack-en \
+            libcurl4-openssl-dev
 
       - name: Build
         id: cmake_build

@@ -67,6 +68,7 @@ jobs:
           cmake .. \
             -DLLAMA_NATIVE=OFF \
             -DLLAMA_BUILD_SERVER=ON \
+            -DLLAMA_CURL=ON \
             -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
             -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
           cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server

@@ -101,12 +103,21 @@ jobs:
         with:
           fetch-depth: 0
 
+      - name: libCURL
+        id: get_libcurl
+        env:
+          CURL_VERSION: 8.6.0_6
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
+          mkdir $env:RUNNER_TEMP/libcurl
+          tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
+
       - name: Build
         id: cmake_build
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ;
+          cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
           cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
 
       - name: Python setup

@@ -120,6 +131,11 @@ jobs:
         run: |
           pip install -r examples/server/tests/requirements.txt
 
+      - name: Copy Libcurl
+        id: prepare_libcurl
+        run: |
+          cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
+
       - name: Tests
         id: server_integration_tests
         if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
CMakeLists.txt

@@ -99,6 +99,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                              "llama: max. batch size for using peer access")
+option(LLAMA_CURL                            "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
 option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
Makefile (5 lines changed)

@@ -595,6 +595,11 @@ include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif
 
+ifdef LLAMA_CURL
+override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
+override LDFLAGS  := $(LDFLAGS) -lcurl
+endif
+
 #
 # Print build information
 #
common/CMakeLists.txt

@@ -68,6 +68,17 @@ if (BUILD_SHARED_LIBS)
     set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
 
+set(LLAMA_COMMON_EXTRA_LIBS build_info)
+
+# Use curl to download model url
+if (LLAMA_CURL)
+    find_package(CURL REQUIRED)
+    add_definitions(-DLLAMA_USE_CURL)
+    include_directories(${CURL_INCLUDE_DIRS})
+    find_library(CURL_LIBRARY curl REQUIRED)
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
+endif ()
+
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
common/common.cpp

@@ -37,6 +37,9 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #endif
+#if defined(LLAMA_USE_CURL)
+#include <curl/curl.h>
+#endif
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data

@@ -50,6 +53,18 @@
 #define GGML_USE_CUBLAS_SYCL_VULKAN
 #endif
 
+#if defined(LLAMA_USE_CURL)
+#ifdef __linux__
+#include <linux/limits.h>
+#elif defined(_WIN32)
+#define PATH_MAX MAX_PATH
+#else
+#include <sys/syslimits.h>
+#endif
+#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX
+#define LLAMA_CURL_MAX_HEADER_LENGTH 256
+#endif // LLAMA_USE_CURL
+
 int32_t get_num_physical_cores() {
 #ifdef __linux__
     // enumerate the set of thread siblings, num entries is num cores

@@ -644,6 +659,13 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         }
         params.model = argv[i];
     }
+    if (arg == "-mu" || arg == "--model-url") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.model_url = argv[i];
+    }
     if (arg == "-md" || arg == "--model-draft") {
         arg_found = true;
         if (++i >= argc) {

@@ -1368,6 +1390,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        layer range to apply the control vector(s) to, start and end inclusive\n");
     printf("  -m FNAME, --model FNAME\n");
     printf("                        model path (default: %s)\n", params.model.c_str());
+    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
+    printf("                        model download url (default: %s)\n", params.model_url.c_str());
     printf("  -md FNAME, --model-draft FNAME\n");
     printf("                        draft model for speculative decoding\n");
     printf("  -ld LOGDIR, --logdir LOGDIR\n");

@@ -1613,10 +1637,222 @@ void llama_batch_add(
     batch.n_tokens++;
 }
 
+#ifdef LLAMA_USE_CURL
+
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
+                                               struct llama_model_params params) {
+    // Basic validation of the model_url
+    if (!model_url || strlen(model_url) == 0) {
+        fprintf(stderr, "%s: invalid model_url\n", __func__);
+        return NULL;
+    }
+
+    // Initialize libcurl globally
+    auto curl = curl_easy_init();
+
+    if (!curl) {
+        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+        return NULL;
+    }
+
+    // Set the URL, allow to follow http redirection
+    curl_easy_setopt(curl, CURLOPT_URL, model_url);
+    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+#if defined(_WIN32)
+    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
+    // operating system. Currently implemented under MS-Windows.
+    curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+
+    // Check if the file already exists locally
+    struct stat model_file_info;
+    auto file_exists = (stat(path_model, &model_file_info) == 0);
+
+    // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
+    char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+    char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
+    snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model);
+
+    char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+    char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
+    snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model);
+
+    if (file_exists) {
+        auto * f_etag = fopen(etag_path, "r");
+        if (f_etag) {
+            if (!fgets(etag, sizeof(etag), f_etag)) {
+                fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
+            } else {
+                fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag);
+            }
+            fclose(f_etag);
+        }
+
+        auto * f_last_modified = fopen(last_modified_path, "r");
+        if (f_last_modified) {
+            if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
+                fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
+            } else {
+                fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path,
+                        last_modified);
+            }
+            fclose(f_last_modified);
+        }
+    }
+
+    // Send a HEAD request to retrieve the etag and last-modified headers
+    struct llama_load_model_from_url_headers {
+        char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+        char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+    };
+    llama_load_model_from_url_headers headers;
+    {
+        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
+        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
+            llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
+
+            const char * etag_prefix = "etag: ";
+            if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
+                strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
+            }
+
+            const char * last_modified_prefix = "last-modified: ";
+            if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) {
+                strncpy(headers->last_modified, buffer + strlen(last_modified_prefix),
+                        n_items - strlen(last_modified_prefix) - 2); // Remove CRLF
+            }
+            return n_items;
+        };
+
+        curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress
+        curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+        curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers);
+
+        CURLcode res = curl_easy_perform(curl);
+        if (res != CURLE_OK) {
+            curl_easy_cleanup(curl);
+            fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+            return NULL;
+        }
+
+        long http_code = 0;
+        curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
+        if (http_code != 200) {
+            // HEAD not supported, we don't know if the file has changed
+            // force trigger downloading
+            file_exists = false;
+            fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+        }
+    }
+
+    // If the ETag or the Last-Modified headers are different: trigger a new download
+    if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) {
+        char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
+        snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model);
+        if (file_exists) {
+            fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model);
+            if (remove(path_model) != 0) {
+                curl_easy_cleanup(curl);
+                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path_model);
+                return NULL;
+            }
+        }
+
+        // Set the output file
+        auto * outfile = fopen(path_model_temporary, "wb");
+        if (!outfile) {
+            curl_easy_cleanup(curl);
+            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model);
+            return NULL;
+        }
+
+        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
+        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
+            return fwrite(data, size, nmemb, (FILE *)fd);
+        };
+        curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
+        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+        curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile);
+
+        // display download progress
+        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+
+        // start the download
+        fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+                model_url, path_model, headers.etag, headers.last_modified);
+        auto res = curl_easy_perform(curl);
+        if (res != CURLE_OK) {
+            fclose(outfile);
+            curl_easy_cleanup(curl);
+            fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+            return NULL;
+        }
+
+        long http_code = 0;
+        curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code);
+        if (http_code < 200 || http_code >= 400) {
+            fclose(outfile);
+            curl_easy_cleanup(curl);
+            fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
+            return NULL;
+        }
+
+        // Clean up
+        fclose(outfile);
+
+        // Write the new ETag to the .etag file
+        if (strlen(headers.etag) > 0) {
+            auto * etag_file = fopen(etag_path, "w");
+            if (etag_file) {
+                fputs(headers.etag, etag_file);
+                fclose(etag_file);
+                fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag);
+            }
+        }
+
+        // Write the new lastModified to the .lastModified file
+        if (strlen(headers.last_modified) > 0) {
+            auto * last_modified_file = fopen(last_modified_path, "w");
+            if (last_modified_file) {
+                fputs(headers.last_modified, last_modified_file);
+                fclose(last_modified_file);
+                fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path,
+                        headers.last_modified);
+            }
+        }
+
+        if (rename(path_model_temporary, path_model) != 0) {
+            curl_easy_cleanup(curl);
+            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model);
+            return NULL;
+        }
+    }
+
+    curl_easy_cleanup(curl);
+
+    return llama_load_model_from_file(path_model, params);
+}
+
+#else
+
+struct llama_model * llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/,
+                                               struct llama_model_params /*params*/) {
+    fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
+    return nullptr;
+}
+
+#endif // LLAMA_USE_CURL
+
 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
     auto mparams = llama_model_params_from_gpt_params(params);
 
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
+    llama_model * model = nullptr;
+    if (!params.model_url.empty()) {
+        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
+    } else {
+        model = llama_load_model_from_file(params.model.c_str(), mparams);
+    }
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
         return std::make_tuple(nullptr, nullptr);
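Not part of the commit: below is a minimal sketch of how the new function can be called directly, assuming a build with `LLAMA_CURL` enabled. The local file name `phi-2-q4_0.gguf` is a made-up example; the URL is the phi-2 example used in the README changes further down.

```cpp
#include "common.h"
#include "llama.h"

#include <cstdio>

int main() {
    llama_model_params mparams = llama_model_default_params();

    // Downloads the GGUF file to the given local path (or reuses it when the
    // cached .etag/.lastModified sidecar files still match the server headers),
    // then loads it with llama_load_model_from_file().
    llama_model * model = llama_load_model_from_url(
        "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf",
        "phi-2-q4_0.gguf",   // hypothetical local cache path
        mparams);

    if (model == NULL) {
        fprintf(stderr, "download or load failed\n");
        return 1;
    }

    llama_free_model(model);
    return 0;
}
```

On a second run with the same arguments, the `.etag`/`.lastModified` files written next to the local path let the function skip the download when the server still reports the same headers.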
common/common.h

@@ -89,6 +89,7 @@ struct gpt_params {
     struct llama_sampling_params sparams;
 
     std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+    std::string model_url = ""; // model url to download
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string prompt = "";

@@ -191,6 +192,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
 
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
+                                               struct llama_model_params params);
+
 // Batch utils
 
 void llama_batch_clear(struct llama_batch & batch);
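Likewise, a rough sketch (not from the commit) of the higher-level path the tools take via `gpt_params`: when `model_url` is set, `llama_init_from_gpt_params` downloads into `params.model` first, otherwise it loads `params.model` from disk. Backend initialization, sampling, and other setup are omitted, and the local path is again a made-up example.

```cpp
#include "common.h"
#include "llama.h"

#include <tuple>

int main() {
    gpt_params params;

    // With model_url set, the loader fetches the file into params.model (the
    // local cache path); with it empty, params.model is read directly.
    params.model_url = "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf";
    params.model     = "phi-2-q4_0.gguf"; // hypothetical local path for the download

    llama_model *   model = nullptr;
    llama_context * ctx   = nullptr;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr || ctx == nullptr) {
        return 1;
    }

    // ... run inference with ctx ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```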
examples/main/README.md

@@ -67,6 +67,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
 In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
 
 -   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
examples/server/README.md

@@ -20,6 +20,7 @@ The project is under active development, and we are [looking for feedback and co
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
 - `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
+- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
examples/server/server.cpp

@@ -2195,6 +2195,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     }
     printf("  -m FNAME, --model FNAME\n");
     printf("                        model path (default: %s)\n", params.model.c_str());
+    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
+    printf("                        model download url (default: %s)\n", params.model_url.c_str());
     printf("  -a ALIAS, --alias ALIAS\n");
     printf("                        set an alias for the model, will be added as `model` field in completion response\n");
     printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");

@@ -2317,6 +2319,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
             break;
         }
         params.model = argv[i];
+    } else if (arg == "-mu" || arg == "--model-url") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.model_url = argv[i];
     } else if (arg == "-a" || arg == "--alias") {
         if (++i >= argc) {
             invalid_param = true;
examples/server/tests/README.md

@@ -57,7 +57,7 @@ Feature or Scenario must be annotated with `@llama.cpp` to be included in the de
 To run a scenario annotated with `@bug`, start:
 
 ```shell
-DEBUG=ON ./tests.sh --no-skipped --tags bug
+DEBUG=ON ./tests.sh --no-skipped --tags bug --stop
 ```
 
 After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated.
examples/server/tests/features/embeddings.feature

@@ -4,7 +4,8 @@ Feature: llama.cpp server
 
   Background: Server startup
     Given a server listening on localhost:8080
-    And   a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models
+    And   a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
+    And   a model file ggml-model-f16.gguf
     And   a model alias bert-bge-small
     And   42 as server seed
     And   2 slots
examples/server/tests/features/environment.py

@@ -1,10 +1,12 @@
-import errno
 import os
-import socket
-import subprocess
-import time
-from contextlib import closing
 import signal
+import socket
+import sys
+import time
+import traceback
+from contextlib import closing
+
+import psutil
 
 
 def before_scenario(context, scenario):

@@ -20,33 +22,40 @@ def before_scenario(context, scenario):
 
 
 def after_scenario(context, scenario):
-    if context.server_process is None:
-        return
-    if scenario.status == "failed":
-        if 'GITHUB_ACTIONS' in os.environ:
-            print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
-            if os.path.isfile('llama.log'):
-                with closing(open('llama.log', 'r')) as f:
-                    for line in f:
-                        print(line)
-        if not is_server_listening(context.server_fqdn, context.server_port):
-            print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")
+    try:
+        if 'server_process' not in context or context.server_process is None:
+            return
+        if scenario.status == "failed":
+            if 'GITHUB_ACTIONS' in os.environ:
+                print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
+                if os.path.isfile('llama.log'):
+                    with closing(open('llama.log', 'r')) as f:
+                        for line in f:
+                            print(line)
+            if not is_server_listening(context.server_fqdn, context.server_port):
+                print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")
 
-    if not pid_exists(context.server_process.pid):
-        assert False, f"Server not running pid={context.server_process.pid} ..."
+        if not pid_exists(context.server_process.pid):
+            assert False, f"Server not running pid={context.server_process.pid} ..."
 
-    server_graceful_shutdown(context)
+        server_graceful_shutdown(context)
 
-    # Wait few for socket to free up
-    time.sleep(0.05)
+        # Wait few for socket to free up
+        time.sleep(0.05)
 
-    attempts = 0
-    while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
-        server_kill(context)
-        time.sleep(0.1)
-        attempts += 1
-        if attempts > 5:
-            server_kill_hard(context)
+        attempts = 0
+        while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
+            server_kill(context)
+            time.sleep(0.1)
+            attempts += 1
+            if attempts > 5:
+                server_kill_hard(context)
+    except:
+        exc = sys.exception()
+        print("error in after scenario: \n")
+        print(exc)
+        print("*** print_tb: \n")
+        traceback.print_tb(exc.__traceback__, file=sys.stdout)

@@ -67,11 +76,11 @@ def server_kill_hard(context):
     path = context.server_path
 
     print(f"Server dangling exits, hard killing force {pid}={path}...\n")
-    if os.name == 'nt':
-        process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode()
-        print(process)
-    else:
-        os.kill(-pid, signal.SIGKILL)
+    try:
+        psutil.Process(pid).kill()
+    except psutil.NoSuchProcess:
+        return False
+    return True
 
 
 def is_server_listening(server_fqdn, server_port):

@@ -84,17 +93,9 @@ def is_server_listening(server_fqdn, server_port):
 
 
 def pid_exists(pid):
-    """Check whether pid exists in the current process table."""
-    if pid < 0:
-        return False
-    if os.name == 'nt':
-        output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode()
-        print(output)
-        return "No tasks are running" not in output
-    else:
-        try:
-            os.kill(pid, 0)
-        except OSError as e:
-            return e.errno == errno.EPERM
-        else:
-            return True
+    try:
+        psutil.Process(pid)
+    except psutil.NoSuchProcess:
+        return False
+    return True
examples/server/tests/features/server.feature

@@ -4,7 +4,8 @@ Feature: llama.cpp server
 
   Background: Server startup
     Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf
+    And   a model file stories260K.gguf
     And   a model alias tinyllama-2
     And   42 as server seed
     # KV Cache corresponds to the total amount of tokens
examples/server/tests/features/steps/steps.py

@@ -5,6 +5,8 @@ import os
 import re
 import socket
 import subprocess
+import sys
+import threading
 import time
 from contextlib import closing
 from re import RegexFlag

@@ -32,6 +34,8 @@ def step_server_config(context, server_fqdn, server_port):
     context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
 
     context.model_alias = None
+    context.model_file = None
+    context.model_url = None
     context.n_batch = None
     context.n_ubatch = None
     context.n_ctx = None

@@ -65,6 +69,16 @@ def step_download_hf_model(context, hf_file, hf_repo):
     print(f"model file: {context.model_file}\n")
 
 
+@step('a model file {model_file}')
+def step_model_file(context, model_file):
+    context.model_file = model_file
+
+
+@step('a model url {model_url}')
+def step_model_url(context, model_url):
+    context.model_url = model_url
+
+
 @step('a model alias {model_alias}')
 def step_model_alias(context, model_alias):
     context.model_alias = model_alias

@@ -141,7 +155,8 @@ def step_start_server(context):
 async def step_wait_for_the_server_to_be_started(context, expecting_status):
     match expecting_status:
         case 'healthy':
-            await wait_for_health_status(context, context.base_url, 200, 'ok')
+            await wait_for_health_status(context, context.base_url, 200, 'ok',
+                                         timeout=30)
 
         case 'ready' | 'idle':
             await wait_for_health_status(context, context.base_url, 200, 'ok',

@@ -1038,8 +1053,11 @@ def start_server_background(context):
     server_args = [
         '--host', server_listen_addr,
         '--port', context.server_port,
-        '--model', context.model_file
     ]
+    if context.model_file:
+        server_args.extend(['--model', context.model_file])
+    if context.model_url:
+        server_args.extend(['--model-url', context.model_url])
     if context.n_batch:
         server_args.extend(['--batch-size', context.n_batch])
     if context.n_ubatch:

@@ -1079,8 +1097,23 @@ def start_server_background(context):
     pkwargs = {
         'creationflags': flags,
+        'stdout': subprocess.PIPE,
+        'stderr': subprocess.PIPE
     }
     context.server_process = subprocess.Popen(
         [str(arg) for arg in [context.server_path, *server_args]],
         **pkwargs)
+
+    def log_stdout(process):
+        for line in iter(process.stdout.readline, b''):
+            print(line.decode('utf-8'), end='')
+    thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,))
+    thread_stdout.start()
+
+    def log_stderr(process):
+        for line in iter(process.stderr.readline, b''):
+            print(line.decode('utf-8'), end='', file=sys.stderr)
+    thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,))
+    thread_stderr.start()
+
     print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
examples/server/tests/requirements.txt

@@ -3,4 +3,5 @@ behave~=1.2.6
 huggingface_hub~=0.20.3
 numpy~=1.24.4
 openai~=0.25.0
+psutil~=5.9.8
 prometheus-client~=0.20.0