common: llama_load_model_from_url using --model-url (#6098)

* common: llama_load_model_from_url with libcurl dependency

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

commit d01b3c4c32 (parent cd776c37c9)
.github/workflows/build.yml (22 lines changed)

@@ -48,6 +48,28 @@ jobs:
           CC=gcc-8 make tests -j $(nproc)
           make test -j $(nproc)
 
+  ubuntu-focal-make-curl:
+    runs-on: ubuntu-20.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev
+
+      - name: Build
+        id: make_build
+        env:
+          LLAMA_FATAL_WARNINGS: 1
+          LLAMA_CURL: 1
+        run: |
+          CC=gcc-8 make -j $(nproc)
+
   ubuntu-latest-cmake:
     runs-on: ubuntu-latest
.github/workflows/server.yml (20 lines changed)

@@ -57,7 +57,8 @@ jobs:
             cmake \
             python3-pip \
             wget \
-            language-pack-en
+            language-pack-en \
+            libcurl4-openssl-dev
 
       - name: Build
         id: cmake_build

@@ -67,6 +68,7 @@ jobs:
           cmake .. \
             -DLLAMA_NATIVE=OFF \
             -DLLAMA_BUILD_SERVER=ON \
+            -DLLAMA_CURL=ON \
             -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
             -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
           cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server

@@ -101,12 +103,21 @@ jobs:
         with:
           fetch-depth: 0
 
+      - name: libCURL
+        id: get_libcurl
+        env:
+          CURL_VERSION: 8.6.0_6
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
+          mkdir $env:RUNNER_TEMP/libcurl
+          tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
+
       - name: Build
         id: cmake_build
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ;
+          cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
           cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
 
       - name: Python setup

@@ -120,6 +131,11 @@ jobs:
         run: |
           pip install -r examples/server/tests/requirements.txt
 
+      - name: Copy Libcurl
+        id: prepare_libcurl
+        run: |
+          cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
+
       - name: Tests
         id: server_integration_tests
         if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
CMakeLists.txt

@@ -99,6 +99,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                              "llama: max. batch size for using peer access")
+option(LLAMA_CURL                            "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
 option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
Makefile (5 lines changed)

@@ -595,6 +595,11 @@ include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif
 
+ifdef LLAMA_CURL
+override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
+override LDFLAGS  := $(LDFLAGS) -lcurl
+endif
+
 #
 # Print build information
 #
common/CMakeLists.txt

@@ -68,6 +68,17 @@ if (BUILD_SHARED_LIBS)
     set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
 
+set(LLAMA_COMMON_EXTRA_LIBS build_info)
+
+# Use curl to download model url
+if (LLAMA_CURL)
+    find_package(CURL REQUIRED)
+    add_definitions(-DLLAMA_USE_CURL)
+    include_directories(${CURL_INCLUDE_DIRS})
+    find_library(CURL_LIBRARY curl REQUIRED)
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
+endif ()
+
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
common/common.cpp

@@ -37,6 +37,9 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #endif
+#if defined(LLAMA_USE_CURL)
+#include <curl/curl.h>
+#endif
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data

@@ -50,6 +53,18 @@
 #define GGML_USE_CUBLAS_SYCL_VULKAN
 #endif
 
+#if defined(LLAMA_USE_CURL)
+#ifdef __linux__
+#include <linux/limits.h>
+#elif defined(_WIN32)
+#define PATH_MAX MAX_PATH
+#else
+#include <sys/syslimits.h>
+#endif
+#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX
+#define LLAMA_CURL_MAX_HEADER_LENGTH 256
+#endif // LLAMA_USE_CURL
+
 int32_t get_num_physical_cores() {
 #ifdef __linux__
     // enumerate the set of thread siblings, num entries is num cores

@@ -644,6 +659,13 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         }
         params.model = argv[i];
     }
+    if (arg == "-mu" || arg == "--model-url") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.model_url = argv[i];
+    }
     if (arg == "-md" || arg == "--model-draft") {
         arg_found = true;
         if (++i >= argc) {

@@ -1368,6 +1390,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        layer range to apply the control vector(s) to, start and end inclusive\n");
     printf("  -m FNAME, --model FNAME\n");
     printf("                        model path (default: %s)\n", params.model.c_str());
+    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
+    printf("                        model download url (default: %s)\n", params.model_url.c_str());
     printf("  -md FNAME, --model-draft FNAME\n");
     printf("                        draft model for speculative decoding\n");
     printf("  -ld LOGDIR, --logdir LOGDIR\n");

@@ -1613,10 +1637,222 @@ void llama_batch_add(
     batch.n_tokens++;
 }
 
+#ifdef LLAMA_USE_CURL
+
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
+                                               struct llama_model_params params) {
+    // Basic validation of the model_url
+    if (!model_url || strlen(model_url) == 0) {
+        fprintf(stderr, "%s: invalid model_url\n", __func__);
+        return NULL;
+    }
+
+    // Initialize libcurl globally
+    auto curl = curl_easy_init();
+
+    if (!curl) {
+        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+        return NULL;
+    }
+
+    // Set the URL, allow to follow http redirection
+    curl_easy_setopt(curl, CURLOPT_URL, model_url);
+    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+#if defined(_WIN32)
+    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
+    // operating system. Currently implemented under MS-Windows.
+    curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+
+    // Check if the file already exists locally
+    struct stat model_file_info;
+    auto file_exists = (stat(path_model, &model_file_info) == 0);
+
+    // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
+    char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+    char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
+    snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model);
+
+    char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+    char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
+    snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model);
+
+    if (file_exists) {
+        auto * f_etag = fopen(etag_path, "r");
+        if (f_etag) {
+            if (!fgets(etag, sizeof(etag), f_etag)) {
+                fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
+            } else {
+                fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag);
+            }
+            fclose(f_etag);
+        }
+
+        auto * f_last_modified = fopen(last_modified_path, "r");
+        if (f_last_modified) {
+            if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
+                fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
+            } else {
+                fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path,
+                        last_modified);
+            }
+            fclose(f_last_modified);
+        }
+    }
+
+    // Send a HEAD request to retrieve the etag and last-modified headers
+    struct llama_load_model_from_url_headers {
+        char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+        char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+    };
+    llama_load_model_from_url_headers headers;
+    {
+        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
+        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
+            llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
+
+            const char * etag_prefix = "etag: ";
+            if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
+                strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
+            }
+
+            const char * last_modified_prefix = "last-modified: ";
+            if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) {
+                strncpy(headers->last_modified, buffer + strlen(last_modified_prefix),
+                        n_items - strlen(last_modified_prefix) - 2); // Remove CRLF
+            }
+            return n_items;
+        };
+
+        curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress
+        curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+        curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers);
+
+        CURLcode res = curl_easy_perform(curl);
+        if (res != CURLE_OK) {
+            curl_easy_cleanup(curl);
+            fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+            return NULL;
+        }
+
+        long http_code = 0;
+        curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
+        if (http_code != 200) {
+            // HEAD not supported, we don't know if the file has changed
+            // force trigger downloading
+            file_exists = false;
+            fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+        }
+    }
+
+    // If the ETag or the Last-Modified headers are different: trigger a new download
+    if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) {
+        char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
+        snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model);
+        if (file_exists) {
+            fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model);
+            if (remove(path_model) != 0) {
+                curl_easy_cleanup(curl);
+                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path_model);
+                return NULL;
+            }
+        }
+
+        // Set the output file
+        auto * outfile = fopen(path_model_temporary, "wb");
+        if (!outfile) {
+            curl_easy_cleanup(curl);
+            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model);
+            return NULL;
+        }
+
+        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
+        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
+            return fwrite(data, size, nmemb, (FILE *)fd);
+        };
+        curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
+        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+        curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile);
+
+        // display download progress
+        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+
+        // start the download
+        fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+                model_url, path_model, headers.etag, headers.last_modified);
+        auto res = curl_easy_perform(curl);
+        if (res != CURLE_OK) {
+            fclose(outfile);
+            curl_easy_cleanup(curl);
+            fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+            return NULL;
+        }
+
+        long http_code = 0;
+        curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code);
+        if (http_code < 200 || http_code >= 400) {
+            fclose(outfile);
+            curl_easy_cleanup(curl);
+            fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
+            return NULL;
+        }
+
+        // Clean up
+        fclose(outfile);
+
+        // Write the new ETag to the .etag file
+        if (strlen(headers.etag) > 0) {
+            auto * etag_file = fopen(etag_path, "w");
+            if (etag_file) {
+                fputs(headers.etag, etag_file);
+                fclose(etag_file);
+                fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag);
+            }
+        }
+
+        // Write the new lastModified to the .lastModified file
+        if (strlen(headers.last_modified) > 0) {
+            auto * last_modified_file = fopen(last_modified_path, "w");
+            if (last_modified_file) {
+                fputs(headers.last_modified, last_modified_file);
+                fclose(last_modified_file);
+                fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path,
+                        headers.last_modified);
+            }
+        }
+
+        if (rename(path_model_temporary, path_model) != 0) {
+            curl_easy_cleanup(curl);
+            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model);
+            return NULL;
+        }
+    }
+
+    curl_easy_cleanup(curl);
+
+    return llama_load_model_from_file(path_model, params);
+}
+
+#else
+
+struct llama_model * llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/,
+                                               struct llama_model_params /*params*/) {
+    fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
+    return nullptr;
+}
+
+#endif // LLAMA_USE_CURL
+
 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
     auto mparams = llama_model_params_from_gpt_params(params);
 
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
+    llama_model * model = nullptr;
+    if (!params.model_url.empty()) {
+        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
+    } else {
+        model = llama_load_model_from_file(params.model.c_str(), mparams);
+    }
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
         return std::make_tuple(nullptr, nullptr);
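Not part of the commit: below is a minimal sketch of how the new function can be called directly, assuming a build with `LLAMA_CURL` enabled. The local file name `phi-2-q4_0.gguf` is a made-up example; the URL is the phi-2 example used in the README changes further down.

```cpp
#include "common.h"
#include "llama.h"

#include <cstdio>

int main() {
    llama_model_params mparams = llama_model_default_params();

    // Downloads the GGUF file to the given local path (or reuses it when the
    // cached .etag/.lastModified sidecar files still match the server headers),
    // then loads it with llama_load_model_from_file().
    llama_model * model = llama_load_model_from_url(
        "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf",
        "phi-2-q4_0.gguf",   // hypothetical local cache path
        mparams);

    if (model == NULL) {
        fprintf(stderr, "download or load failed\n");
        return 1;
    }

    llama_free_model(model);
    return 0;
}
```

On a second run with the same arguments, the `.etag`/`.lastModified` files written next to the local path let the function skip the download when the server still reports the same headers.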
common/common.h

@@ -89,6 +89,7 @@ struct gpt_params {
     struct llama_sampling_params sparams;
 
     std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+    std::string model_url = ""; // model url to download
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string prompt = "";

@@ -191,6 +192,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
 
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
+                                               struct llama_model_params params);
+
 // Batch utils
 
 void llama_batch_clear(struct llama_batch & batch);
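Likewise, a rough sketch (not from the commit) of the higher-level path the tools take via `gpt_params`: when `model_url` is set, `llama_init_from_gpt_params` downloads into `params.model` first, otherwise it loads `params.model` from disk. Backend initialization, sampling, and other setup are omitted, and the local path is again a made-up example.

```cpp
#include "common.h"
#include "llama.h"

#include <tuple>

int main() {
    gpt_params params;

    // With model_url set, the loader fetches the file into params.model (the
    // local cache path); with it empty, params.model is read directly.
    params.model_url = "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf";
    params.model     = "phi-2-q4_0.gguf"; // hypothetical local path for the download

    llama_model *   model = nullptr;
    llama_context * ctx   = nullptr;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr || ctx == nullptr) {
        return 1;
    }

    // ... run inference with ctx ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```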
examples/main/README.md

@@ -67,6 +67,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
 In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
 
 -   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
examples/server/README.md

@@ -20,6 +20,7 @@ The project is under active development, and we are [looking for feedback and co
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
 - `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
+- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
examples/server/server.cpp

@@ -2195,6 +2195,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     }
     printf("  -m FNAME, --model FNAME\n");
     printf("                        model path (default: %s)\n", params.model.c_str());
+    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
+    printf("                        model download url (default: %s)\n", params.model_url.c_str());
     printf("  -a ALIAS, --alias ALIAS\n");
     printf("                        set an alias for the model, will be added as `model` field in completion response\n");
     printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");

@@ -2317,6 +2319,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
             break;
         }
         params.model = argv[i];
+    } else if (arg == "-mu" || arg == "--model-url") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.model_url = argv[i];
     } else if (arg == "-a" || arg == "--alias") {
         if (++i >= argc) {
             invalid_param = true;
examples/server/tests/README.md

@@ -57,7 +57,7 @@ Feature or Scenario must be annotated with `@llama.cpp` to be included in the de
 To run a scenario annotated with `@bug`, start:
 
 ```shell
-DEBUG=ON ./tests.sh --no-skipped --tags bug
+DEBUG=ON ./tests.sh --no-skipped --tags bug --stop
 ```
 
 After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated.
examples/server/tests/features/embeddings.feature

@@ -4,7 +4,8 @@ Feature: llama.cpp server
 
   Background: Server startup
     Given a server listening on localhost:8080
-    And   a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models
+    And   a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
+    And   a model file ggml-model-f16.gguf
     And   a model alias bert-bge-small
     And   42 as server seed
     And   2 slots
examples/server/tests/features/environment.py

@@ -1,10 +1,12 @@
-import errno
 import os
-import socket
-import subprocess
-import time
-from contextlib import closing
 import signal
+import socket
+import sys
+import time
+import traceback
+from contextlib import closing
+
+import psutil
 
 
 def before_scenario(context, scenario):

@@ -20,33 +22,40 @@ def before_scenario(context, scenario):
 
 
 def after_scenario(context, scenario):
-    if context.server_process is None:
-        return
-    if scenario.status == "failed":
-        if 'GITHUB_ACTIONS' in os.environ:
-            print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
-            if os.path.isfile('llama.log'):
-                with closing(open('llama.log', 'r')) as f:
-                    for line in f:
-                        print(line)
-        if not is_server_listening(context.server_fqdn, context.server_port):
-            print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")
+    try:
+        if 'server_process' not in context or context.server_process is None:
+            return
+        if scenario.status == "failed":
+            if 'GITHUB_ACTIONS' in os.environ:
+                print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
+                if os.path.isfile('llama.log'):
+                    with closing(open('llama.log', 'r')) as f:
+                        for line in f:
+                            print(line)
+            if not is_server_listening(context.server_fqdn, context.server_port):
+                print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")
 
-    if not pid_exists(context.server_process.pid):
-        assert False, f"Server not running pid={context.server_process.pid} ..."
+        if not pid_exists(context.server_process.pid):
+            assert False, f"Server not running pid={context.server_process.pid} ..."
 
-    server_graceful_shutdown(context)
+        server_graceful_shutdown(context)
 
-    # Wait few for socket to free up
-    time.sleep(0.05)
+        # Wait few for socket to free up
+        time.sleep(0.05)
 
-    attempts = 0
-    while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
-        server_kill(context)
-        time.sleep(0.1)
-        attempts += 1
-        if attempts > 5:
-            server_kill_hard(context)
+        attempts = 0
+        while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
+            server_kill(context)
+            time.sleep(0.1)
+            attempts += 1
+            if attempts > 5:
+                server_kill_hard(context)
+    except:
+        exc = sys.exception()
+        print("error in after scenario: \n")
+        print(exc)
+        print("*** print_tb: \n")
+        traceback.print_tb(exc.__traceback__, file=sys.stdout)

@@ -67,11 +76,11 @@ def server_kill_hard(context):
     path = context.server_path
 
     print(f"Server dangling exits, hard killing force {pid}={path}...\n")
-    if os.name == 'nt':
-        process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode()
-        print(process)
-    else:
-        os.kill(-pid, signal.SIGKILL)
+    try:
+        psutil.Process(pid).kill()
+    except psutil.NoSuchProcess:
+        return False
+    return True
 
 
 def is_server_listening(server_fqdn, server_port):

@@ -84,17 +93,9 @@ def is_server_listening(server_fqdn, server_port):
 
 
 def pid_exists(pid):
-    """Check whether pid exists in the current process table."""
-    if pid < 0:
-        return False
-    if os.name == 'nt':
-        output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode()
-        print(output)
-        return "No tasks are running" not in output
-    else:
-        try:
-            os.kill(pid, 0)
-        except OSError as e:
-            return e.errno == errno.EPERM
-        else:
-            return True
+    try:
+        psutil.Process(pid)
+    except psutil.NoSuchProcess:
+        return False
+    return True
examples/server/tests/features/server.feature

@@ -4,7 +4,8 @@ Feature: llama.cpp server
 
   Background: Server startup
     Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf
+    And   a model file stories260K.gguf
     And   a model alias tinyllama-2
     And   42 as server seed
     # KV Cache corresponds to the total amount of tokens
examples/server/tests/features/steps/steps.py

@@ -5,6 +5,8 @@ import os
 import re
 import socket
 import subprocess
+import sys
+import threading
 import time
 from contextlib import closing
 from re import RegexFlag

@@ -32,6 +34,8 @@ def step_server_config(context, server_fqdn, server_port):
     context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
 
     context.model_alias = None
+    context.model_file = None
+    context.model_url = None
     context.n_batch = None
     context.n_ubatch = None
     context.n_ctx = None

@@ -65,6 +69,16 @@ def step_download_hf_model(context, hf_file, hf_repo):
     print(f"model file: {context.model_file}\n")
 
 
+@step('a model file {model_file}')
+def step_model_file(context, model_file):
+    context.model_file = model_file
+
+
+@step('a model url {model_url}')
+def step_model_url(context, model_url):
+    context.model_url = model_url
+
+
 @step('a model alias {model_alias}')
 def step_model_alias(context, model_alias):
     context.model_alias = model_alias

@@ -141,7 +155,8 @@ def step_start_server(context):
 async def step_wait_for_the_server_to_be_started(context, expecting_status):
     match expecting_status:
         case 'healthy':
-            await wait_for_health_status(context, context.base_url, 200, 'ok')
+            await wait_for_health_status(context, context.base_url, 200, 'ok',
+                                         timeout=30)
 
         case 'ready' | 'idle':
             await wait_for_health_status(context, context.base_url, 200, 'ok',

@@ -1038,8 +1053,11 @@ def start_server_background(context):
     server_args = [
         '--host', server_listen_addr,
         '--port', context.server_port,
-        '--model', context.model_file
     ]
+    if context.model_file:
+        server_args.extend(['--model', context.model_file])
+    if context.model_url:
+        server_args.extend(['--model-url', context.model_url])
     if context.n_batch:
         server_args.extend(['--batch-size', context.n_batch])
     if context.n_ubatch:

@@ -1079,8 +1097,23 @@ def start_server_background(context):
     pkwargs = {
         'creationflags': flags,
+        'stdout': subprocess.PIPE,
+        'stderr': subprocess.PIPE
     }
     context.server_process = subprocess.Popen(
         [str(arg) for arg in [context.server_path, *server_args]],
         **pkwargs)
+
+    def log_stdout(process):
+        for line in iter(process.stdout.readline, b''):
+            print(line.decode('utf-8'), end='')
+    thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,))
+    thread_stdout.start()
+
+    def log_stderr(process):
+        for line in iter(process.stderr.readline, b''):
+            print(line.decode('utf-8'), end='', file=sys.stderr)
+    thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,))
+    thread_stderr.start()
+
     print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
examples/server/tests/requirements.txt

@@ -3,4 +3,5 @@ behave~=1.2.6
 huggingface_hub~=0.20.3
 numpy~=1.24.4
 openai~=0.25.0
+psutil~=5.9.8
 prometheus-client~=0.20.0