mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 05:17:21 +01:00
Include server in releases + other build system cleanups (#1610)
Set `LLAMA_BUILD_SERVER` in workflow so the `server` example gets built. This currently only applies to Windows builds because it seems like only Windows binary artifacts are included in releases. Add `server` example target to `Makefile` (still uses the `LLAMA_BUILD_SERVER` define and is not built by default). Fix issue where `vdot` binary wasn't removed when running `make clean`. Fix compile warnings in `server` example. Add `.hpp` files to trigger workflow (the server example has one).
This commit is contained in:
parent
97c9b77c4f
commit
0df7d63e5b
16
.github/workflows/build.yml
vendored
16
.github/workflows/build.yml
vendored
@ -10,10 +10,10 @@ on:
|
|||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- master
|
- master
|
||||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
|
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
|
||||||
pull_request:
|
pull_request:
|
||||||
types: [opened, synchronize, reopened]
|
types: [opened, synchronize, reopened]
|
||||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
|
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
|
||||||
|
|
||||||
env:
|
env:
|
||||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||||
@ -157,15 +157,15 @@ jobs:
|
|||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
- build: 'avx2'
|
- build: 'avx2'
|
||||||
defines: ''
|
defines: '-DLLAMA_BUILD_SERVER=ON'
|
||||||
- build: 'avx'
|
- build: 'avx'
|
||||||
defines: '-DLLAMA_AVX2=OFF'
|
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
|
||||||
- build: 'avx512'
|
- build: 'avx512'
|
||||||
defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'clblast'
|
- build: 'clblast'
|
||||||
defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||||
- build: 'openblas'
|
- build: 'openblas'
|
||||||
defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
@ -292,7 +292,7 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake .. -DLLAMA_CUBLAS=ON
|
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
|
||||||
cmake --build . --config Release
|
cmake --build . --config Release
|
||||||
|
|
||||||
- name: Get commit hash
|
- name: Get commit hash
|
||||||
|
13
Makefile
13
Makefile
@ -1,5 +1,11 @@
|
|||||||
# Define the default target now so that it is always the first target
|
# Define the default target now so that it is always the first target
|
||||||
default: main quantize quantize-stats perplexity embedding vdot
|
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
|
||||||
|
|
||||||
|
ifdef LLAMA_BUILD_SERVER
|
||||||
|
BUILD_TARGETS += server
|
||||||
|
endif
|
||||||
|
|
||||||
|
default: $(BUILD_TARGETS)
|
||||||
|
|
||||||
ifndef UNAME_S
|
ifndef UNAME_S
|
||||||
UNAME_S := $(shell uname -s)
|
UNAME_S := $(shell uname -s)
|
||||||
@ -210,7 +216,7 @@ libllama.so: llama.o ggml.o $(OBJS)
|
|||||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
|
rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
|
||||||
|
|
||||||
#
|
#
|
||||||
# Examples
|
# Examples
|
||||||
@ -237,6 +243,9 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o
|
|||||||
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
||||||
@sh scripts/build-info.sh > $@.tmp
|
@sh scripts/build-info.sh > $@.tmp
|
||||||
@if ! cmp -s $@.tmp $@; then \
|
@if ! cmp -s $@.tmp $@; then \
|
||||||
|
@ -61,7 +61,7 @@ struct llama_server_context
|
|||||||
std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
|
std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
// compare the evaluated prompt with the new prompt
|
// compare the evaluated prompt with the new prompt
|
||||||
int new_prompt_len = 0;
|
int new_prompt_len = 0;
|
||||||
for (int i = 0;i < prompt_tokens.size(); i++) {
|
for (size_t i = 0; i < prompt_tokens.size(); i++) {
|
||||||
if (i < processed_tokens.size() &&
|
if (i < processed_tokens.size() &&
|
||||||
processed_tokens[i] == prompt_tokens[i])
|
processed_tokens[i] == prompt_tokens[i])
|
||||||
{
|
{
|
||||||
@ -71,7 +71,7 @@ struct llama_server_context
|
|||||||
{
|
{
|
||||||
embd_inp.push_back(prompt_tokens[i]);
|
embd_inp.push_back(prompt_tokens[i]);
|
||||||
if(new_prompt_len == 0) {
|
if(new_prompt_len == 0) {
|
||||||
if(i - 1 < n_past) {
|
if(int32_t(i) - 1 < n_past) {
|
||||||
processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
|
processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
|
||||||
}
|
}
|
||||||
// Evaluate the new fragment prompt from the last token processed.
|
// Evaluate the new fragment prompt from the last token processed.
|
||||||
@ -136,7 +136,7 @@ struct llama_server_context
|
|||||||
{
|
{
|
||||||
// out of user input, sample next token
|
// out of user input, sample next token
|
||||||
const float temp = params.temp;
|
const float temp = params.temp;
|
||||||
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
|
// const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
|
||||||
const float top_p = params.top_p;
|
const float top_p = params.top_p;
|
||||||
const float tfs_z = params.tfs_z;
|
const float tfs_z = params.tfs_z;
|
||||||
const float typical_p = params.typical_p;
|
const float typical_p = params.typical_p;
|
||||||
@ -306,12 +306,12 @@ struct llama_server_context
|
|||||||
// Avoid add the no show words to the response
|
// Avoid add the no show words to the response
|
||||||
for (std::vector<llama_token> word_tokens : no_show_words)
|
for (std::vector<llama_token> word_tokens : no_show_words)
|
||||||
{
|
{
|
||||||
int match_token = 1;
|
size_t match_token = 1;
|
||||||
if (tokens_predicted.front() == word_tokens.front())
|
if (tokens_predicted.front() == word_tokens.front())
|
||||||
{
|
{
|
||||||
bool execute_matching = true;
|
bool execute_matching = true;
|
||||||
if (tokens_predicted.size() > 1) { // if previus tokens had been tested
|
if (tokens_predicted.size() > 1) { // if previus tokens had been tested
|
||||||
for (int i = 1; i < word_tokens.size(); i++)
|
for (size_t i = 1; i < word_tokens.size(); i++)
|
||||||
{
|
{
|
||||||
if (i >= tokens_predicted.size()) {
|
if (i >= tokens_predicted.size()) {
|
||||||
match_token = i;
|
match_token = i;
|
||||||
@ -601,7 +601,7 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
Server svr;
|
Server svr;
|
||||||
|
|
||||||
svr.Get("/", [](const Request &req, Response &res)
|
svr.Get("/", [](const Request &, Response &res)
|
||||||
{ res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });
|
{ res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });
|
||||||
|
|
||||||
svr.Post("/completion", [&llama](const Request &req, Response &res)
|
svr.Post("/completion", [&llama](const Request &req, Response &res)
|
||||||
@ -649,7 +649,7 @@ int main(int argc, char **argv)
|
|||||||
{"tokens_predicted", llama.num_tokens_predicted}};
|
{"tokens_predicted", llama.num_tokens_predicted}};
|
||||||
return res.set_content(data.dump(), "application/json");
|
return res.set_content(data.dump(), "application/json");
|
||||||
}
|
}
|
||||||
catch (json::exception e)
|
catch (const json::exception &e)
|
||||||
{
|
{
|
||||||
// Some tokens have bad UTF-8 strings, the json parser is very sensitive
|
// Some tokens have bad UTF-8 strings, the json parser is very sensitive
|
||||||
json data = {
|
json data = {
|
||||||
@ -701,7 +701,7 @@ int main(int argc, char **argv)
|
|||||||
{"content", result },
|
{"content", result },
|
||||||
{"stop", !llama.has_next_token }};
|
{"stop", !llama.has_next_token }};
|
||||||
return res.set_content(data.dump(), "application/json");
|
return res.set_content(data.dump(), "application/json");
|
||||||
} catch (json::exception e) {
|
} catch (const json::exception &e) {
|
||||||
// Some tokens have bad UTF-8 strings, the json parser is very sensitive
|
// Some tokens have bad UTF-8 strings, the json parser is very sensitive
|
||||||
json data = {
|
json data = {
|
||||||
{"content", "" },
|
{"content", "" },
|
||||||
|
Loading…
x
Reference in New Issue
Block a user