Merge branch 'master' into vgel/repeng

commit 42abb46c1f by Georgi Gerganov, 2024-03-14 14:26:23 +02:00
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
41 changed files with 1800 additions and 1146 deletions


@@ -333,6 +333,7 @@ jobs:
           mkdir build
           cd build
           cmake -G Xcode .. \
+            -DLLAMA_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
             -DLLAMA_BUILD_SERVER=OFF \
@@ -361,6 +362,7 @@ jobs:
           mkdir build
           cd build
           cmake -G Xcode .. \
+            -DLLAMA_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
             -DLLAMA_BUILD_SERVER=OFF \


@@ -25,17 +25,14 @@ jobs:
     strategy:
       matrix:
         sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
+        build_type: [Debug]
         include:
           - build_type: Release
             sanitizer: ""
-        exclude:
-          - build_type: Release
-            sanitizer: ADDRESS
-          - build_type: Release
-            sanitizer: THREAD
-          - build_type: Release
-            sanitizer: UNDEFINED
+          - build_type: Debug
+            sanitizer: THREAD
+            disabled_on_pr: true
+      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken

     container:
       image: ubuntu:latest
@@ -81,13 +78,14 @@ jobs:
       - name: Tests
         id: server_integration_tests
+        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
           PORT=8888 ./tests.sh

       - name: Slow tests
         id: server_integration_tests_slow
-        if: ${{ github.event.schedule != '' && matrix.build_type == 'Release' || github.event.inputs.slow_tests == 'true' }}
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
           PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
@@ -124,13 +122,14 @@ jobs:
       - name: Tests
         id: server_integration_tests
+        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
           behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp

       - name: Slow tests
         id: server_integration_tests_slow
-        if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }}
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
           behave.exe --stop --no-skipped --no-capture --tags slow


@@ -1,20 +0,0 @@
-name: clang-tidy review post comments
-
-on:
-  workflow_dispatch:
-    workflows: ["clang-tidy-review"]
-    types:
-      - completed
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: ZedThree/clang-tidy-review/post@v0.13.0
-        # lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
-        with:
-          # adjust options as necessary
-          lgtm_comment_body: ''
-          annotations: false
-          max_comments: 25
-


@@ -1,23 +0,0 @@
-name: clang-tidy-review
-
-on:
-  pull_request:
-    branches:
-      - master
-
-jobs:
-  clang-tidy-review:
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v3
-
-      - uses: ZedThree/clang-tidy-review@v0.13.0
-        id: review
-        with:
-          lgtm_comment_body: ''
-          build_dir: build
-          cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on
-          split_workflow: true
-
-      - uses: ZedThree/clang-tidy-review/upload@v0.13.0

.gitignore

@@ -25,6 +25,8 @@
 .vscode/
 .idea/

+ggml-metal-embed.metal
+
 lcov-report/
 gcovr-report/


@@ -118,6 +118,7 @@ option(LLAMA_SYCL "llama: use SYCL"
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
 set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
 option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF)
+set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism")

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -147,6 +148,8 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 include(CheckCXXCompilerFlag)

+add_compile_definitions(GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES})
+
 # enable libstdc++ assertions for debug builds
 if (CMAKE_SYSTEM_NAME MATCHES "Linux")
     add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
@@ -197,9 +200,6 @@ if (LLAMA_METAL)
         add_compile_definitions(GGML_METAL_NDEBUG)
     endif()

-    # get full path to the file
-    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
-
     # copy ggml-common.h and ggml-metal.metal to bin directory
     configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
     configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
@@ -208,53 +208,62 @@ if (LLAMA_METAL)
         enable_language(ASM)
         add_compile_definitions(GGML_METAL_EMBED_LIBRARY)

+        set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h")
         set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")

         file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")

-        set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
+        # merge ggml-common.h and ggml-metal.metal into a single file
+        set(METALLIB_EMBED_ASM    "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
+        set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")

         add_custom_command(
-            OUTPUT ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".section __DATA,__ggml_metallib" >  ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".globl _ggml_metallib_start"     >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo "_ggml_metallib_start:"           >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".globl _ggml_metallib_end"       >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo "_ggml_metallib_end:"             >> ${EMBED_METALLIB_ASSEMBLY}
-            DEPENDS ${METALLIB_SOURCE}
+            OUTPUT ${METALLIB_EMBED_ASM}
+            COMMAND echo "Embedding Metal library"
+            COMMAND sed -e '/\#include \"ggml-common.h\"/r ${METALLIB_COMMON}' -e '/\#include \"ggml-common.h\"/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED}
+            COMMAND echo ".section __DATA,__ggml_metallib"          >  ${METALLIB_EMBED_ASM}
+            COMMAND echo ".globl _ggml_metallib_start"              >> ${METALLIB_EMBED_ASM}
+            COMMAND echo "_ggml_metallib_start:"                    >> ${METALLIB_EMBED_ASM}
+            COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
+            COMMAND echo ".globl _ggml_metallib_end"                >> ${METALLIB_EMBED_ASM}
+            COMMAND echo "_ggml_metallib_end:"                      >> ${METALLIB_EMBED_ASM}
+            DEPENDS ggml-metal.metal ggml-common.h
             COMMENT "Generate assembly for embedded Metal library"
         )

-        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
-    endif()
-
-    if (LLAMA_METAL_SHADER_DEBUG)
-        # custom command to do the following:
-        #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
-        #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
-        #
-        # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
-        #       disabling fast math is needed in order to pass tests/test-backend-ops
-        # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
-        # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
-        #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
-        set(XC_FLAGS -fno-fast-math -fno-inline -g)
-        if (LLAMA_QKK_64)
-            set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
+        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM})
+    else()
+        if (LLAMA_METAL_SHADER_DEBUG)
+            # custom command to do the following:
+            #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
+            #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
+            #
+            # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
+            #       disabling fast math is needed in order to pass tests/test-backend-ops
+            # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
+            # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
+            #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
+            set(XC_FLAGS -fno-fast-math -fno-inline -g)
+        else()
+            set(XC_FLAGS -O3)
         endif()

         add_custom_command(
             OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
             COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
             COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            DEPENDS ggml-metal.metal
+            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
+            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
+            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
+            DEPENDS ggml-metal.metal ggml-common.h
             COMMENT "Compiling Metal kernels"
         )

         add_custom_target(
             ggml-metal ALL
             DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
         )
-    endif()
+    endif() # LLAMA_METAL_EMBED_LIBRARY

     set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
         ${FOUNDATION_LIBRARY}


@@ -167,6 +167,10 @@ ifeq ($(UNAME_S),OpenBSD)
 	MK_CPPFLAGS += -D_BSD_SOURCE
 endif

+ifdef LLAMA_SCHED_MAX_COPIES
+	MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
+endif
+
 ifdef LLAMA_DEBUG
 	MK_CFLAGS   += -O0 -g
 	MK_CXXFLAGS += -O0 -g
@@ -553,15 +557,16 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@

 ifdef LLAMA_METAL_EMBED_LIBRARY
-ggml-metal-embed.o: ggml-metal.metal
+ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 	@echo "Embedding Metal library"
+	@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
 	$(eval TEMP_ASSEMBLY=$(shell mktemp))
 	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
 	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
 	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
-	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
 	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
 	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
 	@$(AS) $(TEMP_ASSEMBLY) -o $@
 	@rm -f ${TEMP_ASSEMBLY}
 endif


@@ -10,12 +10,14 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Recent API changes

+- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
 - [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
 - [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849

 ### Hot topics

+- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
 - Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
 - Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
@@ -902,6 +904,9 @@ First, install the essential packages for termux:
 pkg install clang wget git cmake
 ```

 Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
+
+You can execute the following commands on your computer to avoid downloading the NDK to your mobile phone. Of course, you can also do this in Termux.
+
 ```
 $ mkdir build-android
 $ cd build-android
@@ -910,7 +915,28 @@ $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROI
 $ make
 ```

 Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
-Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone:
+Finally, copy the built `llama` binaries and the model file to your device storage. Because file permissions on the Android sdcard cannot be changed, copy the executables to the `/data/data/com.termux/files/home/bin` path and then run the following commands in Termux to make them executable
+(assuming you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`):
+
+```
+$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
+$cd /data/data/com.termux/files/home/bin
+$chmod +x ./*
+```
+
+Download the model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`:
+
+```
+$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/
+```
+
+Now, you can start chatting:
+
+```
+$cd /data/data/com.termux/files/home/bin
+$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
+```
+
+Here is a demo of an interactive session running on a Pixel 5 phone:

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4


@@ -483,6 +483,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_batch = std::stoi(argv[i]);
+        } else if (arg == "-ub" || arg == "--ubatch-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_ubatch = std::stoi(argv[i]);
         } else if (arg == "--keep") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -1006,7 +1012,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" binary file containing multiple choice tasks.\n");
     printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
-    printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
+    printf(" -ub N, --ubatch-size N\n");
+    printf(" physical maximum batch size (default: %d)\n", params.n_ubatch);
     printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
     printf(" (default: %s)\n", sampler_type_names.c_str());
     printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
@@ -1322,8 +1330,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     auto cparams = llama_context_default_params();

     cparams.n_ctx = params.n_ctx;
-    cparams.n_batch = params.n_batch;
     cparams.n_seq_max = params.n_parallel;
+    cparams.n_batch = params.n_batch;
+    cparams.n_ubatch = params.n_ubatch;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
     cparams.seed = params.seed;
@@ -1443,6 +1452,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
         llama_kv_cache_clear(lctx);
+        llama_synchronize(lctx);
         llama_reset_timings(lctx);
     }
@@ -1931,6 +1941,20 @@ void llama_embd_normalize(const float * inp, float * out, int n) {
     }
 }

+float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){
+    double sum  = 0.0;
+    double sum1 = 0.0;
+    double sum2 = 0.0;
+
+    for (int i = 0; i < n; i++) {
+        sum  += embd1[i] * embd2[i];
+        sum1 += embd1[i] * embd1[i];
+        sum2 += embd2[i] * embd2[i];
+    }
+
+    return sum / (sqrt(sum1) * sqrt(sum2));
+}
+
 //
 // Control vector utils
 //
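For readers skimming the diff: `llama_embd_similarity_cos()` introduced above is the helper that `examples/embedding` and `examples/gritlm` now share. A minimal standalone sketch of how it can be used together with `llama_embd_normalize()`; the vectors are hypothetical, and only the two `common.h` declarations shown in this diff are assumed:

```cpp
#include <cstdio>
#include <vector>

#include "common.h"

int main() {
    const int n_embd = 4; // hypothetical embedding dimension

    // two hypothetical embedding vectors
    const std::vector<float> a = {0.1f, 0.2f, 0.3f, 0.4f};
    const std::vector<float> b = {0.1f, 0.2f, 0.3f, 0.5f};

    // L2-normalize into separate buffers (useful before storing embeddings)
    std::vector<float> a_norm(n_embd), b_norm(n_embd);
    llama_embd_normalize(a.data(), a_norm.data(), n_embd);
    llama_embd_normalize(b.data(), b_norm.data(), n_embd);

    // cosine similarity of the raw vectors; normalizing first does not change the result
    const float sim = llama_embd_similarity_cos(a.data(), b.data(), n_embd);
    printf("cosine similarity: %.3f\n", sim);

    return 0;
}
```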


@@ -51,7 +51,8 @@ struct gpt_params {
     int32_t n_threads_batch_draft = -1;
     int32_t n_predict = -1;   // new tokens to predict
     int32_t n_ctx     = 512;  // context size
-    int32_t n_batch   = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_batch   = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_ubatch  = 512;  // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep    = 0;    // number of tokens to keep from initial prompt
     int32_t n_draft   = 5;    // number of tokens to draft during speculative decoding
     int32_t n_chunks  = -1;   // max number of chunks to process (-1 = unlimited)
@@ -270,6 +271,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40
 void llama_embd_normalize(const float * inp, float * out, int n);

+float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+
 //
 // Control vector utils
 //
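The comments above are where this diff spells out the `n_batch`/`n_ubatch` split: `n_batch` is the logical maximum number of tokens a caller may pass to a single `llama_decode()` call, while `n_ubatch` is the physical micro-batch size the context actually evaluates at once (the pipeline-parallelism work in PR 6017 splits a logical batch into such micro-batches). A minimal sketch of how a caller might set both fields, mirroring `llama_context_params_from_gpt_params()` above and assuming the standard `llama.h` context API; the numeric values are just the new defaults, not a recommendation:

```cpp
#include "llama.h"

// assumes `model` was already loaded, e.g. via llama_load_model_from_file()
static llama_context * new_ctx_with_batch_sizes(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();

    cparams.n_ctx    = 2048; // prompt context size
    cparams.n_batch  = 2048; // logical maximum batch size per llama_decode() call
    cparams.n_ubatch = 512;  // physical maximum batch size evaluated at once

    return llama_new_context_with_model(model, cparams);
}
```

The `llama_synchronize()` calls added throughout the examples and benchmarks in this commit exist for the same reason: decoding can now run asynchronously with respect to the caller, so timing code has to wait for the queued work to finish before reading the clock.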


@@ -17,6 +17,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
         return nullptr;
     }

+    // Ensure that there is a "root" node.
+    if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
+        fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
+        delete result;
+        return nullptr;
+    }
+
     std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());

     result->grammar = llama_grammar_init(


@@ -138,6 +138,8 @@ int main(int argc, char ** argv) {
             LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
             return false;
         }
+
+        llama_synchronize(ctx);
     }

     return true;


@@ -107,7 +107,7 @@ int main(int argc, char ** argv) {
     // max batch size
     const uint64_t n_batch = params.n_batch;
-    GGML_ASSERT(params.n_batch == params.n_ctx);
+    GGML_ASSERT(params.n_batch >= params.n_ctx);

     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
@@ -167,15 +167,26 @@ int main(int argc, char ** argv) {
     float * out = emb + p * n_embd;
     batch_decode(ctx, batch, out, s, n_embd);

-    // print first 3 embeddings
-    for (int j = 0; j < std::min(3, n_prompts); j++) {
-        fprintf(stderr, "embedding %d: ", j);
-        for (int i = 0; i < n_embd; i++) {
-            fprintf(stderr, "%f ", emb[j * n_embd + i]);
+    // print the first part of the embeddings
+    fprintf(stdout, "\n");
+    for (int j = 0; j < n_prompts; j++) {
+        fprintf(stdout, "embedding %d: ", j);
+        for (int i = 0; i < std::min(16, n_embd); i++) {
+            fprintf(stdout, "%f ", emb[j * n_embd + i]);
         }
-        fprintf(stderr, "\n\n");
+        fprintf(stdout, "\n");
+    }
+
+    // print cosine similarity matrix
+    fprintf(stdout, "\n");
+    printf("cosine similarity matrix:\n\n");
+    for (int i = 0; i < n_prompts; i++) {
+        for (int j = 0; j < n_prompts; j++) {
+            float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+            fprintf(stdout, "%6.2f ", sim);
+        }
+        fprintf(stdout, "\n");
     }
-    fprintf(stderr, "\n");

     // clean up
     llama_print_timings(ctx);


@@ -6,22 +6,6 @@

 // #define GRIT_DEBUG

-static float dot_product(const std::vector<float> & v1, const std::vector<float> & v2) {
-    float dot = 0.0f;
-    for (uint64_t i = 0; i < v1.size(); ++i) {
-        dot += v1[i] * v2[i];
-    }
-    return dot;
-}
-
-static float norm(const std::vector<float> & v) {
-    return std::sqrt(dot_product(v, v));
-}
-
-static float cosine_similarity(const std::vector<float> & v1, const std::vector<float> & v2) {
-    return dot_product(v1, v2) / (norm(v1) * norm(v2));
-}
-
 static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
     std::vector<std::vector<float>> result;

@@ -203,10 +187,12 @@ int main(int argc, char * argv[]) {
     const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
     const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));

-    const float cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]);
-    const float cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]);
-    const float cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]);
-    const float cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]);
+    const int n_embd = llama_n_embd(mdl);
+
+    const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
+    const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
+    const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
+    const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);

     std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
     std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);


@@ -164,6 +164,7 @@ struct cmd_params {
     std::vector<int> n_prompt;
     std::vector<int> n_gen;
     std::vector<int> n_batch;
+    std::vector<int> n_ubatch;
     std::vector<ggml_type> type_k;
     std::vector<ggml_type> type_v;
     std::vector<int> n_threads;
@@ -183,7 +184,8 @@ static const cmd_params cmd_params_defaults = {
     /* model    */ {"models/7B/ggml-model-q4_0.gguf"},
     /* n_prompt */ {512},
     /* n_gen    */ {128},
-    /* n_batch  */ {512},
+    /* n_batch  */ {2048},
+    /* n_ubatch */ {512},
     /* type_k   */ {GGML_TYPE_F16},
     /* type_v   */ {GGML_TYPE_F16},
     /* n_threads */ {get_num_physical_cores()},
@@ -208,6 +210,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
     printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
+    printf(" -ub N, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
     printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
     printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
@@ -217,7 +220,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
     printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
-    printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
+    printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
     printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
     printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
     printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
@@ -297,6 +300,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
+        } else if (arg == "-ub" || arg == "--ubatch-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<int>(argv[i], split_delim);
+            params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
         } else if (arg == "-ctk" || arg == "--cache-type-k") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -455,6 +465,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.n_prompt.empty())  { params.n_prompt = cmd_params_defaults.n_prompt; }
     if (params.n_gen.empty())     { params.n_gen = cmd_params_defaults.n_gen; }
     if (params.n_batch.empty())   { params.n_batch = cmd_params_defaults.n_batch; }
+    if (params.n_ubatch.empty())  { params.n_ubatch = cmd_params_defaults.n_ubatch; }
     if (params.type_k.empty())    { params.type_k = cmd_params_defaults.type_k; }
     if (params.type_v.empty())    { params.type_v = cmd_params_defaults.type_v; }
     if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
@@ -474,6 +485,7 @@ struct cmd_params_instance {
     int n_prompt;
     int n_gen;
     int n_batch;
+    int n_ubatch;
     ggml_type type_k;
     ggml_type type_v;
     int n_threads;
@@ -511,6 +523,7 @@ struct cmd_params_instance {
         cparams.n_ctx = n_prompt + n_gen;
         cparams.n_batch = n_batch;
+        cparams.n_ubatch = n_ubatch;
         cparams.type_k = type_k;
         cparams.type_v = type_v;
         cparams.offload_kqv = !no_kv_offload;
@@ -532,6 +545,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & mmp : params.use_mmap)
     for (const auto & embd : params.embeddings)
     for (const auto & nb : params.n_batch)
+    for (const auto & nub : params.n_ubatch)
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
     for (const auto & nkvo : params.no_kv_offload)
@@ -545,6 +559,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
         /* .n_prompt  = */ n_prompt,
         /* .n_gen     = */ 0,
         /* .n_batch   = */ nb,
+        /* .n_ubatch  = */ nub,
         /* .type_k    = */ tk,
         /* .type_v    = */ tv,
         /* .n_threads = */ nt,
@@ -568,6 +583,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
         /* .n_prompt  = */ 0,
        /* .n_gen     = */ n_gen,
         /* .n_batch   = */ nb,
+        /* .n_ubatch  = */ nub,
         /* .type_k    = */ tk,
         /* .type_v    = */ tv,
         /* .n_threads = */ nt,
@@ -604,6 +620,7 @@ struct test {
     uint64_t model_size;
     uint64_t model_n_params;
     int n_batch;
+    int n_ubatch;
     int n_threads;
     ggml_type type_k;
     ggml_type type_v;
@@ -627,6 +644,7 @@ struct test {
         model_size = llama_model_size(lmodel);
         model_n_params = llama_model_n_params(lmodel);
         n_batch = inst.n_batch;
+        n_ubatch = inst.n_ubatch;
         n_threads = inst.n_threads;
         type_k = inst.type_k;
         type_v = inst.type_v;
@@ -705,7 +723,8 @@ struct test {
         "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
         "cpu_info", "gpu_info",
         "model_filename", "model_type", "model_size", "model_n_params",
-        "n_batch", "n_threads", "type_k", "type_v",
+        "n_batch", "n_ubatch",
+        "n_threads", "type_k", "type_v",
         "n_gpu_layers", "split_mode",
         "main_gpu", "no_kv_offload",
         "tensor_split", "use_mmap", "embeddings",
@@ -719,7 +738,8 @@ struct test {
     enum field_type {STRING, BOOL, INT, FLOAT};

     static field_type get_field_type(const std::string & field) {
-        if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
+        if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
+            field == "n_threads" ||
             field == "model_size" || field == "model_n_params" ||
             field == "n_gpu_layers" || field == "main_gpu" ||
             field == "n_prompt" || field == "n_gen" ||
@@ -759,7 +779,8 @@ struct test {
             std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
+            std::to_string(n_batch), std::to_string(n_ubatch),
+            std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
             std::to_string(main_gpu), std::to_string(no_kv_offload),
             tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
@@ -957,6 +978,9 @@ struct markdown_printer : public printer {
         if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
             fields.emplace_back("n_batch");
         }
+        if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) {
+            fields.emplace_back("n_ubatch");
+        }
         if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
             fields.emplace_back("type_k");
         }
@@ -1096,25 +1120,32 @@ struct sql_printer : public printer {
 };

 static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
+    llama_set_n_threads(ctx, n_threads, n_threads);
+
+    //std::vector<llama_token> tokens(n_prompt, llama_token_bos(llama_get_model(ctx)));
+    //llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt, n_past, 0));
+    //GGML_UNUSED(n_batch);
+
     std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
     int n_processed = 0;

-    llama_set_n_threads(ctx, n_threads, n_threads);
-
     while (n_processed < n_prompt) {
         int n_tokens = std::min(n_prompt - n_processed, n_batch);
         llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
         n_processed += n_tokens;
     }
+
+    llama_synchronize(ctx);
 }

 static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
-    llama_token token = llama_token_bos(llama_get_model(ctx));
-
     llama_set_n_threads(ctx, n_threads, n_threads);

+    llama_token token = llama_token_bos(llama_get_model(ctx));
+
     for (int i = 0; i < n_gen; i++) {
         llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
+        llama_synchronize(ctx);
     }
 }
@@ -1203,7 +1234,8 @@ int main(int argc, char ** argv) {
         // warmup run
         if (t.n_prompt > 0) {
-            test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
+            //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             test_gen(ctx, 1, 0, t.n_threads);
@@ -1219,6 +1251,7 @@ int main(int argc, char ** argv) {
             if (t.n_gen > 0) {
                 test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
             }
+
             uint64_t t_ns = get_time_ns() - t_start;
             t.samples_ns.push_back(t_ns);
         }


@@ -221,6 +221,7 @@ actor LlamaContext {
         if llama_decode(context, batch) != 0 {
             print("llama_decode() failed during prompt")
         }
+        llama_synchronize(context)

         let t_pp_end = ggml_time_us()
@@ -240,6 +241,7 @@ actor LlamaContext {
             if llama_decode(context, batch) != 0 {
                 print("llama_decode() failed during text generation")
             }
+            llama_synchronize(context)
         }

         let t_tg_end = ggml_time_us()


@@ -63,12 +63,20 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director
 ```console
 git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
 ```

-2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
+2) Install the required Python packages:
+
+```sh
+pip install -r examples/llava/requirements.txt
+```
+
+3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
 ```console
 python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
 ```
 - you will find a llava.projector and a llava.clip file in your model directory
-3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
+
+4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
 ```console
 mkdir vit
 cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
@@ -76,18 +84,18 @@ cp ../llava-v1.6-vicuna-7b/llava.projector vit/
 curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
 ```

-4) Create the visual gguf model:
+5) Create the visual gguf model:
 ```console
 python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
 ```
 - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP

-5) Then convert the model to gguf format:
+6) Then convert the model to gguf format:
 ```console
 python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```

-6) And finally we can run the llava-cli using the 1.6 model version:
+7) And finally we can run the llava-cli using the 1.6 model version:
 ```console
 ./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
 ```


@@ -589,9 +589,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
             }
         }

-        const auto t_end = std::chrono::high_resolution_clock::now();
-
         if (i == 0) {
+            llama_synchronize(ctx);
+            const auto t_end = std::chrono::high_resolution_clock::now();
             const float t_total = std::chrono::duration<float>(t_end - t_start).count();
             fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
             int total_seconds = (int)(t_total*n_chunk/n_seq);


@@ -147,7 +147,7 @@ struct server_slot {
     int32_t n_decoded = 0;
     int32_t n_remaining = -1;
     int32_t i_batch = -1;
-    int32_t n_predict = -1;
+    int32_t n_predict = -1; // TODO: disambiguate from params.n_predict

     int32_t n_prompt_tokens = 0;
     int32_t n_prompt_tokens_processed = 0;
@@ -739,7 +739,13 @@ struct server_context {
         default_generation_settings_for_props = get_formated_generation(slots.front());
         default_generation_settings_for_props["seed"] = -1;

-        batch = llama_batch_init(n_ctx, 0, params.n_parallel);
+        // the update_slots() logic will always submit a maximum of n_batch tokens
+        // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
+        {
+            const int32_t n_batch = llama_n_batch(ctx);
+            batch = llama_batch_init(n_batch, 0, params.n_parallel);
+        }

         metrics.init();
     }
@@ -1036,8 +1042,10 @@ struct server_context {
                 llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
             }

-            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch) {
-                const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i));
+            const int32_t n_batch = llama_n_batch(ctx);
+
+            for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
+                const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i);

                 llama_batch batch_view = {
                     n_tokens,
                     batch.token + i,
@@ -1226,7 +1234,7 @@ struct server_context {
             {"mirostat_eta", slot.sparams.mirostat_eta},
             {"penalize_nl", slot.sparams.penalize_nl},
             {"stop", slot.params.antiprompt},
-            {"n_predict", slot.params.n_predict},
+            {"n_predict", slot.params.n_predict}, // TODO: fix duplicate key n_predict
             {"n_keep", params.n_keep},
             {"ignore_eos", ignore_eos},
             {"stream", slot.params.stream},
@@ -1738,7 +1746,8 @@ struct server_context {
         }

         // process in chunks of params.n_batch
-        int32_t n_batch = params.n_batch;
+        int32_t n_batch  = llama_n_batch(ctx);
+        int32_t n_ubatch = llama_n_ubatch(ctx);

         // next, batch any pending prompts without exceeding n_batch
         if (params.cont_batching || batch.n_tokens == 0) {
@@ -1811,7 +1820,7 @@ struct server_context {
                     if (slot.embedding) {
                         // this prompt is too large to process - discard it
-                        if (slot.n_prompt_tokens > n_batch) {
+                        if (slot.n_prompt_tokens > n_ubatch) {
                             slot.state = SLOT_STATE_PROCESSING;
                             slot.command = SLOT_COMMAND_NONE;
                             slot.release();
@@ -2157,7 +2166,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n");
     printf(" -dt N, --defrag-thold N\n");
     printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
-    printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
+    printf(" -ub N, --ubatch-size N physical maximum batch size (default: %d)\n", params.n_ubatch);
     printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
     if (llama_supports_mlock()) {
@@ -2424,6 +2434,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
             break;
         }
         params.n_batch = std::stoi(argv[i]);
+    } else if (arg == "-ub" || arg == "--ubatch-size") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.n_ubatch = std::stoi(argv[i]);
     } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
         if (++i >= argc) {
             invalid_param = true;
@@ -2763,6 +2779,7 @@ int main(int argc, char ** argv) {
         res.set_header("Access-Control-Allow-Credentials", "true");
         res.set_header("Access-Control-Allow-Methods", "POST");
         res.set_header("Access-Control-Allow-Headers", "*");
+        return res.set_content("", "application/json; charset=utf-8");
     });

     svr->set_logger(log_server_request);
@@ -3371,44 +3388,37 @@ int main(int argc, char ** argv) {
         const json body = json::parse(req.body);
         bool is_openai = false;

-        // an input prompt can string or a list of tokens (integer)
-        std::vector<json> prompts;
+        // an input prompt can be a string or a list of tokens (integer)
+        json prompt;
         if (body.count("input") != 0) {
             is_openai = true;
-            if (body["input"].is_array()) {
-                // support multiple prompts
-                for (const json & elem : body["input"]) {
-                    prompts.push_back(elem);
-                }
-            } else {
-                // single input prompt
-                prompts.push_back(body["input"]);
-            }
+            prompt = body["input"];
         } else if (body.count("content") != 0) {
-            // only support single prompt here
-            std::string content = body["content"];
-            prompts.push_back(content);
+            // with "content", we only support single prompt
+            prompt = std::vector<std::string>{body["content"]};
         } else {
             res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
             return;
         }

-        // process all prompts
-        json responses = json::array();
-        for (auto & prompt : prompts) {
-            // TODO @ngxson : maybe support multitask for this endpoint?
-            // create and queue the task
+        // create and queue the task
+        json responses;
+        {
             const int id_task = ctx_server.queue_tasks.get_new_id();

             ctx_server.queue_results.add_waiting_task_id(id_task);
-            ctx_server.request_completion(id_task, -1, { {"prompt", prompt}, { "n_predict", 0}}, false, true);
+            ctx_server.request_completion(id_task, -1, {{"prompt", prompt}}, false, true);

             // get the result
             server_task_result result = ctx_server.queue_results.recv(id_task);
             ctx_server.queue_results.remove_waiting_task_id(id_task);

             if (!result.error) {
-                // append to the responses
-                responses.push_back(result.data);
+                if (result.data.count("results")) {
+                    // result for multi-task
+                    responses = result.data["results"];
+                } else {
+                    // result for single task
+                    responses = std::vector<json>{result.data};
+                }
             } else {
                 // error received, ignore everything else
                 res_error(res, result.data);
@@ -3417,24 +3427,19 @@ int main(int argc, char ** argv) {
         }

         // write JSON response
-        json root;
-        if (is_openai) {
-            json res_oai = json::array();
-            int i = 0;
-            for (auto & elem : responses) {
-                res_oai.push_back(json{
-                    {"embedding", json_value(elem, "embedding", json::array())},
-                    {"index", i++},
-                    {"object", "embedding"}
-                });
-            }
-            root = format_embeddings_response_oaicompat(body, res_oai);
-        } else {
-            root = responses[0];
-        }
+        json root = is_openai
+            ? format_embeddings_response_oaicompat(body, responses)
+            : responses[0];
+
         return res.set_content(root.dump(), "application/json; charset=utf-8");
     };

+    auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
+        return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
+            res.set_content(reinterpret_cast<const char*>(content), len, mime_type);
+            return false;
+        };
+    };
+
     //
     // Router
     //
@@ -3446,17 +3451,6 @@ int main(int argc, char ** argv) {
     }

     // using embedded static files
-    auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
-        return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
-            res.set_content(reinterpret_cast<const char*>(content), len, mime_type);
-            return false;
-        };
-    };
-
-    svr->Options(R"(/.*)", [](const httplib::Request &, httplib::Response & res) {
-        // TODO @ngxson : I have no idea what it is... maybe this is redundant?
-        return res.set_content("", "application/json; charset=utf-8");
-    });
-
     svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
     svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
     svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));


@@ -9,6 +9,7 @@ Feature: llama.cpp server
     And   42 as server seed
     And   2 slots
     And   1024 as batch size
+    And   1024 as ubatch size
     And   2048 KV cache size
     And   embeddings extraction
     Then  the server is starting


@@ -33,6 +33,7 @@ def step_server_config(context, server_fqdn, server_port):
     context.model_alias = None
     context.n_batch = None
+    context.n_ubatch = None
     context.n_ctx = None
     context.n_ga = None
     context.n_ga_w = None
@@ -118,6 +119,10 @@ def step_server_metrics(context):
 def step_start_server(context):
     start_server_background(context)
     attempts = 0
+    max_attempts = 20
+    if 'GITHUB_ACTIONS' in os.environ:
+        max_attempts *= 2
+
     while True:
         with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
             result = sock.connect_ex((context.server_fqdn, context.server_port))
@@ -125,7 +130,7 @@ def step_start_server(context):
                 print("\x1b[33;46mserver started!\x1b[0m")
                 return
             attempts += 1
-            if attempts > 20:
+            if attempts > max_attempts:
                 assert False, "server not started"

             print(f"waiting for server to start, connect error code = {result}...")
             time.sleep(0.1)
@@ -278,6 +283,11 @@ def step_n_batch(context, n_batch):
     context.n_batch = n_batch


+@step('{n_ubatch:d} as ubatch size')
+def step_n_ubatch(context, n_ubatch):
+    context.n_ubatch = n_ubatch
+
+
 @step('{seed:d} as seed')
 def step_seed(context, seed):
     context.seed = seed
@@ -937,6 +947,9 @@ async def wait_for_health_status(context,
     print(f"Starting checking for health for expected_health_status={expected_health_status}\n")
     interval = 0.5
     counter = 0
+    if 'GITHUB_ACTIONS' in os.environ:
+        timeout *= 2
+
     async with aiohttp.ClientSession() as session:
         while True:
             async with await session.get(f'{base_url}/health', params=params) as health_response:
@@ -1029,6 +1042,8 @@ def start_server_background(context):
     ]
     if context.n_batch:
         server_args.extend(['--batch-size', context.n_batch])
+    if context.n_ubatch:
+        server_args.extend(['--ubatch-size', context.n_ubatch])
     if context.n_gpu_layer:
         server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
     if context.server_continuous_batching:


@ -529,6 +529,16 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
} }
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
json data = json::array();
int i = 0;
for (auto & elem : embeddings) {
data.push_back(json{
{"embedding", json_value(elem, "embedding", json::array())},
{"index", i++},
{"object", "embedding"}
});
}
json res = json { json res = json {
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", "list"}, {"object", "list"},
@ -536,7 +546,7 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
{"prompt_tokens", 0}, {"prompt_tokens", 0},
{"total_tokens", 0} {"total_tokens", 0}
}}, }},
{"data", embeddings} {"data", data}
}; };
return res; return res;
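For reference, a rough sketch of the JSON this helper now assembles, written directly against nlohmann::json; the embedding values and model name are made up for illustration, and the stock value() accessor stands in for the json_value() helper:

    #include <nlohmann/json.hpp>
    #include <iostream>

    using json = nlohmann::json;

    int main() {
        // hypothetical per-prompt results as the server collects them
        json responses = json::array({
            {{"embedding", {0.12, -0.03, 0.57}}},
            {{"embedding", {0.01,  0.44, -0.20}}}
        });

        // same shape as format_embeddings_response_oaicompat() now produces
        json data = json::array();
        int i = 0;
        for (auto & elem : responses) {
            data.push_back(json{
                {"embedding", elem.value("embedding", json::array())},
                {"index", i++},
                {"object", "embedding"}
            });
        }
        json root = {
            {"model", "example-model"},  // normally taken from the request body
            {"object", "list"},
            {"usage", json{{"prompt_tokens", 0}, {"total_tokens", 0}}},
            {"data", data}
        };
        std::cout << root.dump(2) << std::endl;
        return 0;
    }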


@ -61,7 +61,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
} }
} }
// TODO: GGML_PAD ?
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
assert(alignment && !(alignment & (alignment - 1))); // power of 2 assert(alignment && !(alignment & (alignment - 1))); // power of 2
size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment; size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
@ -69,25 +68,14 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
} }
// tallocr // tallocr
struct ggml_tallocr {
ggml_backend_buffer_t buffer;
void * base;
size_t alignment;
size_t offset;
};
ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
if (talloc == NULL) {
return NULL;
}
struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
void * base = ggml_backend_buffer_get_base(buffer); void * base = ggml_backend_buffer_get_base(buffer);
size_t align = ggml_backend_buffer_get_alignment(buffer); size_t align = ggml_backend_buffer_get_alignment(buffer);
assert(align && !(align & (align - 1))); // power of 2 assert(align && !(align & (align - 1))); // power of 2
*talloc = (struct ggml_tallocr) { struct ggml_tallocr talloc = (struct ggml_tallocr) {
/*.buffer = */ buffer, /*.buffer = */ buffer,
/*.base = */ base, /*.base = */ base,
/*.alignment = */ align, /*.alignment = */ align,
@ -96,11 +84,7 @@ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
return talloc; return talloc;
} }
void ggml_tallocr_free(ggml_tallocr_t talloc) { void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
free(talloc);
}
void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor); size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
size = GGML_PAD(size, talloc->alignment); size = GGML_PAD(size, talloc->alignment);
@ -354,12 +338,16 @@ struct hash_node {
bool allocated; bool allocated;
}; };
//
struct tensor_alloc { struct tensor_alloc {
size_t offset; size_t offset;
size_t size_max; // 0 = pre-allocated, unused, or view size_t size_max; // 0 = pre-allocated, unused, or view
}; };
struct leaf_alloc {
int buffer_id;
struct tensor_alloc leaf;
};
struct node_alloc { struct node_alloc {
int buffer_id; int buffer_id;
struct tensor_alloc dst; struct tensor_alloc dst;
@ -378,7 +366,7 @@ struct ggml_gallocr {
struct node_alloc * node_allocs; // [n_nodes] struct node_alloc * node_allocs; // [n_nodes]
int n_nodes; int n_nodes;
struct tensor_alloc * leaf_allocs; // [n_leafs] struct leaf_alloc * leaf_allocs; // [n_leafs]
int n_leafs; int n_leafs;
}; };
@ -543,13 +531,20 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
return node_buffer_ids ? node_buffer_ids[i] : 0; return node_buffer_ids ? node_buffer_ids[i] : 0;
} }
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) { static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
// clear hash tables // clear hash tables
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *)); memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node)); memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
// allocate leafs
// these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
}
// count number of children and views // count number of children and views
// allocate all graph inputs and leafs first to avoid overwriting them // allocate other graph inputs and leafs first to avoid overwriting them
for (int i = 0; i < graph->n_nodes; i++) { for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i]; struct ggml_tensor * node = graph->nodes[i];
@ -577,19 +572,6 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
} }
} }
// allocate the remaining leafs that are unused on the graph
// these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
if (hn->n_children == 0) {
assert(!hn->allocated);
// since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
ggml_gallocr_allocate_node(galloc, leaf, 0);
}
}
// allocate tensors // allocate tensors
for (int i = 0; i < graph->n_nodes; i++) { for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i]; struct ggml_tensor * node = graph->nodes[i];
@ -652,7 +634,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
} }
} }
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) { bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
size_t hash_size = graph->visited_hash_table.size; size_t hash_size = graph->visited_hash_table.size;
// initialize hash table // initialize hash table
@ -676,7 +658,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
} }
// allocate in hash table // allocate in hash table
ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids); ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
// set the node_allocs from the hash table // set the node_allocs from the hash table
if (galloc->n_nodes < graph->n_nodes) { if (galloc->n_nodes < graph->n_nodes) {
@ -711,15 +693,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
} }
if (galloc->n_leafs < graph->n_leafs) { if (galloc->n_leafs < graph->n_leafs) {
free(galloc->leaf_allocs); free(galloc->leaf_allocs);
galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs); galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
GGML_ASSERT(galloc->leaf_allocs != NULL); GGML_ASSERT(galloc->leaf_allocs != NULL);
} }
galloc->n_leafs = graph->n_leafs; galloc->n_leafs = graph->n_leafs;
for (int i = 0; i < graph->n_leafs; i++) { for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i]; struct ggml_tensor * leaf = graph->leafs[i];
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
galloc->leaf_allocs[i].offset = hn->offset; galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); galloc->leaf_allocs[i].leaf.offset = hn->offset;
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
} }
// reallocate buffers if needed // reallocate buffers if needed
@ -727,7 +710,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0; size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
if (new_size > cur_size) { // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
if (new_size > cur_size || galloc->buffers[i] == NULL) {
#ifndef NDEBUG #ifndef NDEBUG
fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif #endif
@ -744,30 +728,30 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
} }
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
return ggml_gallocr_reserve_n(galloc, graph, NULL); return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
} }
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) { static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max); assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
if (node->view_src != NULL) { if (tensor->view_src != NULL) {
if (node->buffer == NULL) { if (tensor->buffer == NULL) {
assert(tensor_alloc->offset == SIZE_MAX); assert(tensor_alloc->offset == SIZE_MAX);
if (node->view_src->buffer == NULL) { if (tensor->view_src->buffer == NULL) {
// this tensor was allocated without ggml-backend // this tensor was allocated without ggml-backend
return; return;
} }
ggml_backend_view_init(galloc->buffers[buffer_id], node); ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
} }
} else { } else {
if (node->data == NULL) { if (tensor->data == NULL) {
assert(tensor_alloc->offset != SIZE_MAX); assert(tensor_alloc->offset != SIZE_MAX);
assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max); assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]); void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
void * addr = (char *)base + tensor_alloc->offset; void * addr = (char *)base + tensor_alloc->offset;
ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr); ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
} else { } else {
if (node->buffer == NULL) { if (tensor->buffer == NULL) {
// this tensor was allocated without ggml-backend // this tensor was allocated without ggml-backend
return; return;
} }
@ -843,13 +827,18 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
// reset buffers // reset buffers
for (int i = 0; i < galloc->n_buffers; i++) { for (int i = 0; i < galloc->n_buffers; i++) {
// zero size buffers are not allocated
if (galloc->buffers[i] != NULL) { if (galloc->buffers[i] != NULL) {
ggml_backend_buffer_reset(galloc->buffers[i]); ggml_backend_buffer_reset(galloc->buffers[i]);
} }
} }
// allocate the graph tensors from the previous assignments // allocate the graph tensors from the previous assignments
// leafs
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
}
// nodes // nodes
for (int i = 0; i < graph->n_nodes; i++) { for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i]; struct ggml_tensor * node = graph->nodes[i];
@ -863,12 +852,6 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
} }
ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst); ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
} }
// leafs
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
}
return true; return true;
} }
@ -900,12 +883,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
return false; return false;
} }
struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer); struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) { for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) { if (t->data == NULL) {
if (t->view_src == NULL) { if (t->view_src == NULL) {
ggml_tallocr_alloc(tallocr, t); ggml_tallocr_alloc(&tallocr, t);
} else if (t->buffer == NULL) { } else if (t->buffer == NULL) {
ggml_backend_view_init(buffer, t); ggml_backend_view_init(buffer, t);
} }
@ -917,8 +900,6 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
} }
} }
ggml_tallocr_free(tallocr);
*buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1)); *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
(*buffers)[(*n_buffers)++] = buffer; (*buffers)[(*n_buffers)++] = buffer;


@ -11,11 +11,15 @@ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t; typedef struct ggml_backend * ggml_backend_t;
// Tensor allocator // Tensor allocator
typedef struct ggml_tallocr * ggml_tallocr_t; struct ggml_tallocr {
ggml_backend_buffer_t buffer;
void * base;
size_t alignment;
size_t offset;
};
GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer); GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc); GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
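A minimal usage sketch for the reworked tensor allocator: ggml_tallocr is now a plain struct returned by value, so there is no heap allocation and no ggml_tallocr_free() anymore. The buffer size and tensor shape below are illustrative, and the CPU buffer type is used only to keep the example self-contained:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    int main() {
        // metadata-only context; the tensor data lives in the backend buffer
        struct ggml_init_params params = { /*.mem_size =*/ ggml_tensor_overhead() * 8, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);

        // a CPU buffer with some slack for alignment padding
        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), ggml_nbytes(t) + 256);

        struct ggml_tallocr talloc = ggml_tallocr_new(buf);  // by value, nothing to free
        ggml_tallocr_alloc(&talloc, t);                       // places t inside buf

        // ... write to / read from t->data ...

        ggml_backend_buffer_free(buf);
        ggml_free(ctx);
        return 0;
    }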
// Graph allocator // Graph allocator
/* /*
@ -50,7 +54,11 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
// returns false if the buffer allocation failed // returns false if the buffer allocation failed
GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph); GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids); GGML_API bool ggml_gallocr_reserve_n(
ggml_gallocr_t galloc,
struct ggml_cgraph * graph,
const int * node_buffer_ids,
const int * leaf_buffer_ids);
// automatic reallocation if the topology changes when using a single buffer // automatic reallocation if the topology changes when using a single buffer
// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers) // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
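A hedged sketch of the extended reserve call: the per-leaf buffer ids are new, and passing NULL for both id arrays (as ggml_gallocr_reserve() does) keeps the previous behaviour. Both buffer slots use the CPU buffer type here purely so the example is self-contained; in practice they would map to different backends:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"
    #include <vector>

    int main() {
        struct ggml_init_params params = { /*.mem_size =*/ ggml_tensor_overhead() * 16 + ggml_graph_overhead(), /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true };
        struct ggml_context * ctx = ggml_init(params);

        // tiny graph: c = a * b
        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
        struct ggml_tensor * c = ggml_mul(ctx, a, b);
        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, c);

        ggml_backend_buffer_type_t bufts[2] = { ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_type() };
        ggml_gallocr_t galloc = ggml_gallocr_new_n(bufts, 2);

        // per-node and (new with this change) per-leaf buffer ids
        std::vector<int> node_ids(gf->n_nodes, 0);   // nodes in buffer 0
        std::vector<int> leaf_ids(gf->n_leafs, 1);   // leafs in buffer 1
        ggml_gallocr_reserve_n(galloc, gf, node_ids.data(), leaf_ids.data());
        ggml_gallocr_alloc_graph(galloc, gf);        // reuses the reserved layout

        ggml_gallocr_free(galloc);
        ggml_free(ctx);
        return 0;
    }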


@ -86,12 +86,12 @@ extern "C" {
// (optional) asynchronous tensor data access // (optional) asynchronous tensor data access
void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst); bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) complete all pending operations // (optional) complete all pending operations
void (*GGML_CALL synchronize)(ggml_backend_t backend); void (*GGML_CALL synchronize)(ggml_backend_t backend);
// create a plan for ggml_cgraph and free it // compute graph with a plan (not used currently)
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
@ -102,16 +102,27 @@ extern "C" {
// check if the backend supports an operation // check if the backend supports an operation
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// (optional) event synchronization
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
void (*GGML_CALL event_free) (ggml_backend_event_t event);
void (*GGML_CALL event_record) (ggml_backend_event_t event);
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
}; };
struct ggml_backend { struct ggml_backend {
ggml_guid_t guid; ggml_guid_t guid;
struct ggml_backend_i iface; struct ggml_backend_i iface;
ggml_backend_context_t context; ggml_backend_context_t context;
}; };
struct ggml_backend_event {
ggml_backend_t backend;
void * context;
};
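A sketch of how the new, optional event API is intended to be driven, assuming the event functions are also exposed in the public ggml-backend.h header. A backend that leaves the hooks NULL (like the CPU backend below) signals that by returning NULL from ggml_backend_event_new(), and callers fall back to a plain synchronize:

    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        ggml_backend_t cpu = ggml_backend_cpu_init();

        // NULL hooks mean no event support, so this returns NULL
        ggml_backend_event_t ev = ggml_backend_event_new(cpu);
        printf("event support: %s\n", ev ? "yes" : "no");

        if (ev != NULL) {
            ggml_backend_event_record(ev);        // record after queuing async work
            ggml_backend_event_synchronize(ev);   // block the host until it completes
            // or, without blocking the host, make another backend wait on it:
            // ggml_backend_event_wait(other_backend, ev);
        } else {
            ggml_backend_synchronize(cpu);        // fallback used by the scheduler too
        }

        ggml_backend_event_free(ev);              // NULL-safe
        ggml_backend_free(cpu);
        return 0;
    }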
// //
// Backend registry // Backend registry
// //


@ -221,29 +221,29 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(buf != NULL && "tensor buffer not set"); GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
if (!size) { if (!size) {
return; return;
} }
tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size); buf->iface.set_tensor(buf, tensor, data, offset, size);
} }
GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
if (!size) { if (!size) {
return; return;
} }
tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size); buf->iface.get_tensor(buf, tensor, data, offset, size);
} }
void ggml_backend_synchronize(ggml_backend_t backend) { void ggml_backend_synchronize(ggml_backend_t backend) {
@ -255,18 +255,30 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
} }
ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
GGML_ASSERT(backend->iface.graph_plan_create != NULL);
return backend->iface.graph_plan_create(backend, cgraph); return backend->iface.graph_plan_create(backend, cgraph);
} }
void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
GGML_ASSERT(backend->iface.graph_plan_free != NULL);
backend->iface.graph_plan_free(backend, plan); backend->iface.graph_plan_free(backend, plan);
} }
enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
return backend->iface.graph_plan_compute(backend, plan); return backend->iface.graph_plan_compute(backend, plan);
} }
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
ggml_backend_synchronize(backend);
return err;
}
enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
return backend->iface.graph_compute(backend, cgraph); return backend->iface.graph_compute(backend, cgraph);
} }
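ggml_backend_graph_compute() is now a synchronous wrapper around the async variant. A sketch of the pattern callers can adopt, assuming ggml_backend_graph_compute_async() is exposed next to it in the public header; on the CPU backend the call is still effectively synchronous, and the overlap only pays off on backends with an async queue:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        ggml_backend_t backend = ggml_backend_cpu_init();

        struct ggml_init_params params = { /*.mem_size =*/ ggml_tensor_overhead() * 8 + ggml_graph_overhead(), /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * c = ggml_add(ctx, a, b);
        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, c);

        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

        const float av[4] = {1, 2, 3, 4};
        const float bv[4] = {10, 20, 30, 40};
        ggml_backend_tensor_set(a, av, 0, sizeof(av));
        ggml_backend_tensor_set(b, bv, 0, sizeof(bv));

        // queue the graph, optionally overlap host work, then synchronize
        ggml_backend_graph_compute_async(backend, gf);
        // ... other host-side work could run here on async backends ...
        ggml_backend_synchronize(backend);

        float cv[4];
        ggml_backend_tensor_get(c, cv, 0, sizeof(cv));
        printf("c[0] = %.1f\n", cv[0]);

        ggml_backend_buffer_free(buf);
        ggml_free(ctx);
        ggml_backend_free(backend);
        return 0;
    }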
@ -314,34 +326,68 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
} }
} }
void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
if (src == dst) { if (src == dst) {
return; return;
} }
if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) { if (backend_dst->iface.cpy_tensor_async != NULL) {
if (backend->iface.cpy_tensor_async != NULL) { if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
if (backend->iface.cpy_tensor_async(backend, src, dst)) { return;
return;
}
} }
} }
size_t nbytes = ggml_nbytes(src); // an async copy would normally happen after all the queued operations on both backends are completed
// sync src, set_async dst
if (ggml_backend_buffer_is_host(src->buffer)) { if (ggml_backend_buffer_is_host(src->buffer)) {
ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes); ggml_backend_synchronize(backend_src);
} ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
else { } else {
ggml_backend_synchronize(backend_src);
ggml_backend_tensor_copy(src, dst); ggml_backend_tensor_copy(src, dst);
ggml_backend_synchronize(backend_dst);
} }
} }
// events
ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
if (backend->iface.event_new == NULL) {
return NULL;
}
return backend->iface.event_new(backend);
}
void ggml_backend_event_free(ggml_backend_event_t event) {
if (event == NULL) {
return;
}
event->backend->iface.event_free(event);
}
void ggml_backend_event_record(ggml_backend_event_t event) {
GGML_ASSERT(event->backend->iface.event_record != NULL);
event->backend->iface.event_record(event);
}
void ggml_backend_event_synchronize(ggml_backend_event_t event) {
GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
event->backend->iface.event_synchronize(event);
}
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
GGML_ASSERT(backend->iface.event_wait != NULL);
backend->iface.event_wait(backend, event);
}
// backend registry // backend registry
#define GGML_MAX_BACKENDS_REG 16 #define GGML_REG_MAX_BACKENDS 16
struct ggml_backend_reg { struct ggml_backend_reg {
char name[128]; char name[128];
@ -350,7 +396,7 @@ struct ggml_backend_reg {
void * user_data; void * user_data;
}; };
static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG]; static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
static size_t ggml_backend_registry_count = 0; static size_t ggml_backend_registry_count = 0;
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data); GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
@ -395,7 +441,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
} }
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG); GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
size_t id = ggml_backend_registry_count; size_t id = ggml_backend_registry_count;
@ -746,8 +792,12 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
if (cpu_ctx->work_size < cplan.work_size) { if (cpu_ctx->work_size < cplan.work_size) {
// TODO: may be faster to free and use malloc to avoid the copy free(cpu_ctx->work_data);
cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size); cpu_ctx->work_data = malloc(cplan.work_size);
if (cpu_ctx->work_data == NULL) {
cpu_ctx->work_size = 0;
return GGML_STATUS_ALLOC_FAILED;
}
cpu_ctx->work_size = cplan.work_size; cpu_ctx->work_size = cplan.work_size;
} }
cplan.work_data = cpu_ctx->work_data; cplan.work_data = cpu_ctx->work_data;
@ -784,6 +834,11 @@ static struct ggml_backend_i cpu_backend_i = {
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
/* .graph_compute = */ ggml_backend_cpu_graph_compute, /* .graph_compute = */ ggml_backend_cpu_graph_compute,
/* .supports_op = */ ggml_backend_cpu_supports_op, /* .supports_op = */ ggml_backend_cpu_supports_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .event_synchronize = */ NULL,
}; };
static ggml_guid_t ggml_backend_cpu_guid(void) { static ggml_guid_t ggml_backend_cpu_guid(void) {
@ -939,15 +994,27 @@ static bool ggml_is_view_op(enum ggml_op op) {
// scheduler // scheduler
#define GGML_MAX_BACKENDS 16 #ifndef GGML_SCHED_MAX_BACKENDS
#define GGML_MAX_SPLITS 256 #define GGML_SCHED_MAX_BACKENDS 16
#define GGML_MAX_SPLIT_INPUTS 16 #endif
#ifndef GGML_SCHED_MAX_SPLITS
#define GGML_SCHED_MAX_SPLITS 256
#endif
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
#define GGML_SCHED_MAX_SPLIT_INPUTS 16
#endif
#ifndef GGML_SCHED_MAX_COPIES
#define GGML_SCHED_MAX_COPIES 4
#endif
struct ggml_backend_sched_split { struct ggml_backend_sched_split {
int backend_id; int backend_id;
int i_start; int i_start;
int i_end; int i_end;
struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS]; struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
int n_inputs; int n_inputs;
// graph view of this split // graph view of this split
struct ggml_cgraph graph; struct ggml_cgraph graph;
@ -955,45 +1022,53 @@ struct ggml_backend_sched_split {
struct ggml_backend_sched { struct ggml_backend_sched {
bool is_reset; // true if the scheduler has been reset since the last graph split bool is_reset; // true if the scheduler has been reset since the last graph split
bool is_alloc;
int n_backends; int n_backends;
ggml_backend_t backends[GGML_MAX_BACKENDS];
ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
ggml_gallocr_t galloc; ggml_gallocr_t galloc;
// hash keys of the nodes in the graph // hash keys of the nodes in the graph
struct ggml_hash_set hash_set; struct ggml_hash_set hash_set;
// hash values // hash values
int * tensor_backend_id; int * tensor_backend_id;
struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS]; struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
int * node_backend_ids; // [n_nodes] int * node_backend_ids; // [graph_size]
int n_nodes; int * leaf_backend_ids; // [graph_size]
// copy of the graph with modified inputs // copy of the graph with modified inputs
struct ggml_cgraph * graph; struct ggml_cgraph * graph;
struct ggml_backend_sched_split splits[GGML_MAX_SPLITS]; // graph splits
struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
int n_splits; int n_splits;
// pipeline parallelism support
int n_copies;
int cur_copy;
ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
int n_graph_inputs;
struct ggml_context * ctx; struct ggml_context * ctx;
ggml_backend_sched_eval_callback callback_eval; ggml_backend_sched_eval_callback callback_eval;
void * callback_eval_user_data; void * callback_eval_user_data;
// align context_buffer to GGML_MEM_ALIGN // align context_buffer to GGML_MEM_ALIGN
#ifdef _MSC_VER #ifdef _MSC_VER
__declspec(align(GGML_MEM_ALIGN)) __declspec(align(GGML_MEM_ALIGN))
#else #else
__attribute__((aligned(GGML_MEM_ALIGN))) __attribute__((aligned(GGML_MEM_ALIGN)))
#endif #endif
char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)]; char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
}; };
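The new n_copies/cur_copy/events fields are what enable pipeline parallelism: each split input gets up to GGML_SCHED_MAX_COPIES staging copies, and the scheduler rotates through them so the inputs of the next micro-batch can be uploaded while the previous one is still computing. A deliberately simplified model of that rotation (not the scheduler code itself, with made-up names):

    #include <array>
    #include <cstdio>

    constexpr int SCHED_MAX_COPIES = 4;   // mirrors GGML_SCHED_MAX_COPIES

    struct input_slot {
        // stand-in for tensor_copies[hash_id][backend_id][copy]
        std::array<int, SCHED_MAX_COPIES> copies{};
    };

    int main() {
        const int n_copies = 2;   // pipeline depth; 1 restores the old, non-overlapped behaviour
        int cur_copy = 0;
        input_slot slot;

        for (int batch = 0; batch < 6; batch++) {
            slot.copies[cur_copy] = batch;         // "upload" this batch into its own copy
            printf("batch %d staged in copy #%d\n", batch, cur_copy);
            // compute for this batch would be queued against copies[cur_copy],
            // guarded by events[backend][cur_copy] so the next upload cannot race it
            cur_copy = (cur_copy + 1) % n_copies;  // rotate, as the scheduler does per graph
        }
        return 0;
    }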
#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node) #define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
#define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)] #define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
// returns the priority of the backend, lower id is higher priority // returns the priority of the backend, lower id is higher priority
static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) { static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@ -1005,7 +1080,8 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
return -1; return -1;
} }
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) { static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
ggml_backend_buffer_t buffer = tensor->buffer;
if (buffer == NULL) { if (buffer == NULL) {
return -1; return -1;
} }
@ -1016,12 +1092,16 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
return i; return i;
} }
} }
GGML_ASSERT(false && "tensor buffer type not supported by any backend");
return -1; // silence warning fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
__func__, ggml_backend_buffer_name(buffer), tensor->name);
GGML_ASSERT(false);
return -1;
} }
#if 0 #if 0
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__) #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
#define GET_CAUSE(node) causes[hash_id(node)] #define GET_CAUSE(node) causes[hash_id(node)]
#else #else
@ -1035,19 +1115,28 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
// assign pre-allocated nodes to their backend // assign pre-allocated nodes to their backend
// dst // dst
int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer); int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
if (cur_backend != -1) { if (cur_backend != -1) {
SET_CAUSE(node, "1.dst"); SET_CAUSE(tensor, "1.dst");
return cur_backend; return cur_backend;
} }
// view_src // view_src
if (tensor->view_src != NULL) { if (tensor->view_src != NULL) {
cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer); cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
if (cur_backend != -1) { if (cur_backend != -1) {
SET_CAUSE(node, "1.vsrc"); SET_CAUSE(tensor, "1.vsrc");
return cur_backend; return cur_backend;
} }
} }
// input
if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
SET_CAUSE(tensor, "1.inp");
return cur_backend;
}
// assign nodes that use weights to the backend of the weights // assign nodes that use weights to the backend of the weights
for (int i = 0; i < GGML_MAX_SRC; i++) { for (int i = 0; i < GGML_MAX_SRC; i++) {
const struct ggml_tensor * src = tensor->src[i]; const struct ggml_tensor * src = tensor->src[i];
@ -1055,9 +1144,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
continue; continue;
} }
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer); int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
// operations with weights are always run on the same backend as the weights // operations with weights are always run on the same backend as the weights
SET_CAUSE(node, "1.wgt%d", i); SET_CAUSE(tensor, "1.wgt%d", i);
return src_backend; return src_backend;
} }
} }
@ -1093,7 +1182,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
if (ggml_is_view_op(node->op)) { if (ggml_is_view_op(node->op)) {
continue; continue;
} }
ggml_backend_t tensor_backend = tensor_backend(node); ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
for (int j = 0; j < GGML_MAX_SRC; j++) { for (int j = 0; j < GGML_MAX_SRC; j++) {
@ -1101,7 +1190,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
if (src == NULL) { if (src == NULL) {
continue; continue;
} }
ggml_backend_t src_backend = tensor_backend(src); ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name, fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
} }
@ -1118,6 +1207,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
// reset splits // reset splits
sched->n_splits = 0; sched->n_splits = 0;
sched->n_graph_inputs = 0;
sched->is_reset = false; sched->is_reset = false;
struct ggml_init_params params = { struct ggml_init_params params = {
@ -1163,7 +1253,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
} }
} }
#ifdef DEBUG_PASS1 #ifdef DEBUG_PASS1
fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif #endif
// pass 2: expand current backend assignments // pass 2: expand current backend assignments
@ -1171,28 +1261,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend) // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
// pass 2.1 expand gpu up
{
int cur_backend_id = -1;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
struct ggml_tensor * node = graph->nodes[i];
if (ggml_is_view_op(node->op)) {
continue;
}
int tensor_backend_id = tensor_backend_id(node);
if (tensor_backend_id != -1) {
if (tensor_backend_id == sched->n_backends - 1) {
// skip cpu (lowest prio backend)
cur_backend_id = -1;
} else {
cur_backend_id = tensor_backend_id;
}
} else {
tensor_backend_id(node) = cur_backend_id;
SET_CAUSE(node, "2.1");
}
}
}
// pass 2.2 expand gpu down // pass 2.2 expand gpu down
{ {
@ -1217,7 +1285,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
} }
} }
// pass 2.3 expand rest up // pass 2.1 expand gpu up
{ {
int cur_backend_id = -1; int cur_backend_id = -1;
for (int i = graph->n_nodes - 1; i >= 0; i--) { for (int i = graph->n_nodes - 1; i >= 0; i--) {
@ -1227,14 +1295,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
} }
int tensor_backend_id = tensor_backend_id(node); int tensor_backend_id = tensor_backend_id(node);
if (tensor_backend_id != -1) { if (tensor_backend_id != -1) {
cur_backend_id = tensor_backend_id; if (tensor_backend_id == sched->n_backends - 1) {
// skip cpu (lowest prio backend)
cur_backend_id = -1;
} else {
cur_backend_id = tensor_backend_id;
}
} else { } else {
tensor_backend_id(node) = cur_backend_id; tensor_backend_id(node) = cur_backend_id;
SET_CAUSE(node, "2.3"); SET_CAUSE(node, "2.1");
} }
} }
} }
// pass 2.4 expand rest down // pass 2.4 expand rest down
{ {
int cur_backend_id = -1; int cur_backend_id = -1;
@ -1252,8 +1326,26 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
} }
} }
} }
// pass 2.3 expand rest up
{
int cur_backend_id = -1;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
struct ggml_tensor * node = graph->nodes[i];
if (ggml_is_view_op(node->op)) {
continue;
}
int tensor_backend_id = tensor_backend_id(node);
if (tensor_backend_id != -1) {
cur_backend_id = tensor_backend_id;
} else {
tensor_backend_id(node) = cur_backend_id;
SET_CAUSE(node, "2.3");
}
}
}
#ifdef DEBUG_PASS2 #ifdef DEBUG_PASS2
fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif #endif
// pass 3: assign backends to remaining src from dst and view_src // pass 3: assign backends to remaining src from dst and view_src
@ -1283,7 +1375,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
} }
} }
#ifdef DEBUG_PASS3 #ifdef DEBUG_PASS3
fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif #endif
// pass 4: split graph, find tensors that need to be copied // pass 4: split graph, find tensors that need to be copied
@ -1315,7 +1407,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (tensor_backend_id != cur_backend_id) { if (tensor_backend_id != cur_backend_id) {
sched->splits[cur_split].i_end = i; sched->splits[cur_split].i_end = i;
cur_split++; cur_split++;
GGML_ASSERT(cur_split < GGML_MAX_SPLITS); GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
sched->splits[cur_split].backend_id = tensor_backend_id; sched->splits[cur_split].backend_id = tensor_backend_id;
sched->splits[cur_split].i_start = i; sched->splits[cur_split].i_start = i;
sched->splits[cur_split].n_inputs = 0; sched->splits[cur_split].n_inputs = 0;
@ -1328,25 +1420,57 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (src == NULL) { if (src == NULL) {
continue; continue;
} }
int src_backend_id = tensor_backend_id(src); int src_backend_id = tensor_backend_id(src);
assert(src_backend_id != -1); // all inputs should be assigned by now assert(src_backend_id != -1); // all inputs should be assigned by now
if (src->flags & GGML_TENSOR_FLAG_INPUT) {
size_t id = hash_id(src);
if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
ggml_backend_t backend = sched->backends[src_backend_id];
for (int c = 0; c < sched->n_copies; c++) {
struct ggml_tensor * tensor_copy;
if (c == sched->cur_copy) {
tensor_copy = src; // use the original tensor as the current copy
} else {
tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
}
if (sched->n_copies > 1) {
ggml_set_input(tensor_copy);
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
}
sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
tensor_backend_id(tensor_copy) = src_backend_id;
SET_CAUSE(tensor_copy, "4.cpy");
}
int n_graph_inputs = sched->n_graph_inputs++;
GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
sched->graph_inputs[n_graph_inputs] = src;
}
}
if (src_backend_id != tensor_backend_id) { if (src_backend_id != tensor_backend_id) {
// create a copy of the input in the split's backend // create a copy of the input in the split's backend
size_t id = hash_id(src); size_t id = hash_id(src);
if (sched->tensor_copies[id][cur_backend_id] == NULL) { if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
ggml_backend_t backend = sched->backends[cur_backend_id]; ggml_backend_t backend = sched->backends[cur_backend_id];
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); for (int c = 0; c < sched->n_copies; c++) {
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name); struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
sched->tensor_copies[id][cur_backend_id] = tensor_copy; if (sched->n_copies > 1) {
tensor_backend_id(tensor_copy) = cur_backend_id; ggml_set_input(tensor_copy);
SET_CAUSE(tensor_copy, "4.cpy"); ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
}
sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
tensor_backend_id(tensor_copy) = cur_backend_id;
SET_CAUSE(tensor_copy, "4.cpy");
}
int n_inputs = sched->splits[cur_split].n_inputs++; int n_inputs = sched->splits[cur_split].n_inputs++;
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
sched->splits[cur_split].inputs[n_inputs] = src; sched->splits[cur_split].inputs[n_inputs] = src;
} }
node->src[j] = sched->tensor_copies[id][cur_backend_id]; node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
} }
} }
} }
@ -1354,37 +1478,39 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->n_splits = cur_split + 1; sched->n_splits = cur_split + 1;
} }
#ifdef DEBUG_PASS4 #ifdef DEBUG_PASS4
fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif #endif
#ifndef NDEBUG #ifndef NDEBUG
// sanity check: all sources should have the same backend as the node // sanity check: all sources should have the same backend as the node
for (int i = 0; i < graph->n_nodes; i++) { for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i]; struct ggml_tensor * node = graph->nodes[i];
ggml_backend_t tensor_backend = tensor_backend(node); ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
if (tensor_backend == NULL) { if (tensor_backend == NULL) {
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name); fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
} }
if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) { if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n", fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL"); node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
} }
for (int j = 0; j < GGML_MAX_SRC; j++) { for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j]; struct ggml_tensor * src = node->src[j];
if (src == NULL) { if (src == NULL) {
continue; continue;
} }
ggml_backend_t src_backend = tensor_backend(src); ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
if (src_backend != tensor_backend /* && src_backend != NULL */) { if (src_backend != tensor_backend /* && src_backend != NULL */) {
fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n", fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL"); j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
} }
if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) { if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n", fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
src->name, src_backend ? ggml_backend_name(src_backend) : "NULL", src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL"); src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
} }
} }
} }
@ -1392,18 +1518,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
#endif #endif
// create copies of the graph for each split // create copies of the graph for each split
// FIXME: avoid this copy, pass split inputs to ggml_gallocr_alloc_graph_n in some other way // TODO: avoid this copy
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false); struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
for (int i = 0; i < sched->n_splits; i++) { for (int i = 0; i < sched->n_splits; i++) {
struct ggml_backend_sched_split * split = &sched->splits[i]; struct ggml_backend_sched_split * split = &sched->splits[i];
split->graph = ggml_graph_view(graph, split->i_start, split->i_end); split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
for (int j = 0; j < split->n_inputs; j++) { for (int j = 0; j < split->n_inputs; j++) {
struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input = split->inputs[j];
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id]; struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
// add a dependency to the input source so that it is not freed before the copy is done // add a dependency to the input source so that it is not freed before the copy is done
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input); struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
input_dep->src[0] = input;
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input); sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
graph_copy->nodes[graph_copy->n_nodes++] = input_dep; graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
@ -1417,18 +1545,56 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j]; graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
} }
} }
if (sched->n_copies > 1) {
// add input copies as leafs so that they are allocated first
for (int i = 0; i < sched->n_graph_inputs; i++) {
struct ggml_tensor * input = sched->graph_inputs[i];
size_t id = hash_id(input);
int backend_id = tensor_backend_id(input);
for (int c = 0; c < sched->n_copies; c++) {
struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
}
}
for (int i = 0; i < sched->n_splits; i++) {
struct ggml_backend_sched_split * split = &sched->splits[i];
int backend_id = split->backend_id;
for (int j = 0; j < split->n_inputs; j++) {
struct ggml_tensor * input = split->inputs[j];
size_t id = hash_id(input);
for (int c = 0; c < sched->n_copies; c++) {
struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
}
}
}
}
// add leafs from the original graph
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
}
sched->graph = graph_copy; sched->graph = graph_copy;
} }
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
// ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids); // allocate graph
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
// the re-allocation may cause the split inputs to be moved to a different address
ggml_backend_sched_synchronize(sched);
#ifndef NDEBUG #ifndef NDEBUG
fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n"); fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
#endif #endif
ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids); ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n"); fprintf(stderr, "%s: failed to allocate graph\n", __func__);
return false; return false;
} }
} }
@ -1437,9 +1603,6 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
} }
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
struct ggml_backend_sched_split * splits = sched->splits; struct ggml_backend_sched_split * splits = sched->splits;
for (int i = 0; i < sched->n_splits; i++) { for (int i = 0; i < sched->n_splits; i++) {
@ -1448,34 +1611,36 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
ggml_backend_t split_backend = sched->backends[split_backend_id]; ggml_backend_t split_backend = sched->backends[split_backend_id];
// copy the input tensors to the split backend // copy the input tensors to the split backend
uint64_t copy_start_us = ggml_time_us();
for (int j = 0; j < split->n_inputs; j++) { for (int j = 0; j < split->n_inputs; j++) {
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input = split->inputs[j];
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id]; struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
GGML_ASSERT(input->buffer != NULL); if (input->flags & GGML_TENSOR_FLAG_INPUT) {
GGML_ASSERT(input_cpy->buffer != NULL); // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
ggml_backend_tensor_copy(input, input_cpy);
} else {
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
ggml_backend_synchronize(input_backend);
}
ggml_backend_tensor_copy_async(split_backend, input, input_cpy); ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
}
} }
//ggml_backend_synchronize(split_backend); // necessary to measure copy time
int64_t copy_end_us = ggml_time_us();
copy_us[split_backend_id] += copy_end_us - copy_start_us;
#if 0
char split_filename[GGML_MAX_NAME];
snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
ggml_graph_dump_dot(split->graph, NULL, split_filename);
#endif
uint64_t compute_start_us = ggml_time_us();
if (!sched->callback_eval) { if (!sched->callback_eval) {
enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph); enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
if (ec != GGML_STATUS_SUCCESS) { if (ec != GGML_STATUS_SUCCESS) {
return ec; return ec;
} }
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
} else { } else {
// similar to ggml_backend_compare_graph_backend // similar to ggml_backend_compare_graph_backend
for (int j0 = 0; j0 < split->graph.n_nodes; j0++) { for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
@ -1494,11 +1659,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv); enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
if (ec != GGML_STATUS_SUCCESS) { if (ec != GGML_STATUS_SUCCESS) {
return ec; return ec;
} }
// TODO: pass backend to the callback, then the user can decide if they want to synchronize
ggml_backend_synchronize(split_backend);
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) { if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
break; break;
} }
@ -1506,39 +1674,54 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
j0 = j1; j0 = j1;
} }
} }
uint64_t compute_end_us = ggml_time_us();
compute_us[split_backend_id] += compute_end_us - compute_start_us;
}
#if 0 // record the event of this copy
// per-backend timings if (split->n_inputs > 0) {
fprintf(stderr, "sched_compute_splits times (%d splits):\n", sched->n_splits); if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
for (int i = 0; i < sched->n_backends; i++) { ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
if (copy_us[i] > 0 || compute_us[i] > 0) { }
fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
} }
} }
#endif
sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
return GGML_STATUS_SUCCESS; return GGML_STATUS_SUCCESS;
} }
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) { ggml_backend_sched_t ggml_backend_sched_new(
ggml_backend_t * backends,
ggml_backend_buffer_type_t * bufts,
int n_backends,
size_t graph_size,
bool parallel) {
GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
// initialize hash table // initialize hash table
sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size); sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size); sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size); sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
sched->n_backends = n_backends; sched->n_backends = n_backends;
for (int i = 0; i < n_backends; i++) {
sched->backends[i] = backends[i]; sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
for (int b = 0; b < n_backends; b++) {
sched->backends[b] = backends[b];
sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
if (sched->n_copies > 1) {
for (int c = 0; c < sched->n_copies; c++) {
sched->events[b][c] = ggml_backend_event_new(backends[b]);
}
}
} }
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
@ -1552,12 +1735,18 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
if (sched == NULL) { if (sched == NULL) {
return; return;
} }
for (int b = 0; b < sched->n_backends; b++) {
for (int c = 0; c < sched->n_copies; c++) {
ggml_backend_event_free(sched->events[b][c]);
}
}
ggml_gallocr_free(sched->galloc); ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx); ggml_free(sched->ctx);
free(sched->hash_set.keys); free(sched->hash_set.keys);
free(sched->tensor_backend_id); free(sched->tensor_backend_id);
free(sched->tensor_copies); free(sched->tensor_copies);
free(sched->node_backend_ids); free(sched->node_backend_ids);
free(sched->leaf_backend_ids);
free(sched); free(sched);
} }
@ -1569,34 +1758,63 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size); memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
sched->is_reset = true; sched->is_reset = true;
sched->is_alloc = false;
} }
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
ggml_backend_sched_split_graph(sched, measure_graph); ggml_backend_sched_split_graph(sched, measure_graph);
if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) { // TODO: extract this to a separate function
if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
return false; return false;
} }
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
ggml_backend_sched_synchronize(sched);
return true;
}
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
ggml_backend_sched_split_graph(sched, graph);
if (!ggml_backend_sched_alloc_splits(sched)) {
return false;
}
sched->is_alloc = true;
return true; return true;
} }
enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
ggml_backend_sched_synchronize(sched);
return err;
}
if (!sched->is_reset) { enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
if (!sched->is_reset && !sched->is_alloc) {
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
} }
ggml_backend_sched_split_graph(sched, graph); if (!sched->is_alloc) {
if (!ggml_backend_sched_alloc_splits(sched)) { if (!ggml_backend_sched_alloc_graph(sched, graph)) {
return GGML_STATUS_ALLOC_FAILED; return GGML_STATUS_ALLOC_FAILED;
}
} }
return ggml_backend_sched_compute_splits(sched); return ggml_backend_sched_compute_splits(sched);
} }
void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
for (int i = 0; i < sched->n_backends; i++) {
ggml_backend_synchronize(sched->backends[i]);
}
}
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
sched->callback_eval = callback; sched->callback_eval = callback;
sched->callback_eval_user_data = user_data; sched->callback_eval_user_data = user_data;
@ -1606,19 +1824,24 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
return sched->n_splits; return sched->n_splits;
} }
int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
return sched->n_copies;
}
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend); int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index); return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
} }
void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend); int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
tensor_backend_id(node) = backend_index; tensor_backend_id(node) = backend_index;
} }
ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
int backend_index = tensor_backend_id(node); int backend_index = tensor_backend_id(node);
if (backend_index == -1) { if (backend_index == -1) {
return NULL; return NULL;
View File
@ -9,6 +9,7 @@ extern "C" {
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t; typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t; typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t; typedef void * ggml_backend_graph_plan_t;
@ -72,11 +73,24 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends // tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
// asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
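// Illustrative sketch (editor's addition, not part of this patch): invoking the new two-backend
// async copy. backend_src, backend_dst, t_src and t_dst are assumed to exist and the two tensors
// are assumed to have identical shape and type.
//
//   void copy_example(ggml_backend_t backend_src, ggml_backend_t backend_dst,
//                     struct ggml_tensor * t_src, struct ggml_tensor * t_dst) {
//       // queued after the work already submitted to backend_src; backend_dst will not run
//       // later operations until the copy completes (falls back to a blocking copy if async
//       // transfers are not supported by the backends involved)
//       ggml_backend_tensor_copy_async(backend_src, backend_dst, t_src, t_dst);
//       ggml_backend_synchronize(backend_dst); // block the host until the data is in place
//   }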
// events
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
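// Minimal lifecycle sketch (editor's addition, assuming two already-initialized backends) of the
// event API, mirroring how the scheduler uses it earlier in this patch: record on the event's
// owning backend, then wait asynchronously from another backend or block on the host.
//
//   void event_example(ggml_backend_t producer, ggml_backend_t consumer) {
//       ggml_backend_event_t ev = ggml_backend_event_new(producer);
//       // ... submit work to producer ...
//       ggml_backend_event_record(ev);          // mark the point reached on producer
//       ggml_backend_event_wait(consumer, ev);  // consumer waits without blocking the host
//       ggml_backend_event_synchronize(ev);     // alternatively, block the host until it fires
//       ggml_backend_event_free(ev);
//   }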
// //
// CPU backend // CPU backend
@ -123,27 +137,31 @@ extern "C" {
/* /*
Example usage: Example usage:
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends); // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
// sched is initialized with measure allocators and cannot be used until allocated with a measure graph // preferably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
// initialize buffers from a measure graph sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
// in build_graph: // initialize buffers from a max size graph (optional)
build_graph(...) { reserve_graph = build_graph(sched, max_batch_size);
// manually assign nodes to a backend (optional, should not be needed in most cases)
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
}
// allocate backend buffers from measure graph // manually assign nodes to a backend (optional, should not be needed in most cases)
ggml_backend_sched_init_measure(sched, measure_graph); struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
// the scheduler is now ready to compute graphs ggml_backend_sched_reserve(sched, reserve_graph);
// compute // compute
graph = build_graph(sched); graph = build_graph(sched);
ggml_backend_sched_graph_compute(sched, graph); ggml_backend_sched_graph_compute(sched, graph);
// if there are graph inputs:
ggml_backend_sched_reset(sched);
ggml_backend_sched_alloc_graph(sched, graph);
ggml_backend_tensor_set(input_tensor, ...);
ggml_backend_sched_graph_compute(sched, graph);
}
*/ */
struct ggml_backend_sched; struct ggml_backend_sched;
@ -158,20 +176,26 @@ extern "C" {
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler // Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph // Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
// Get the number of splits of the last graph // Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate and compute graph on the backend scheduler // Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
// Reset all assignments and allocators - must be called before changing the node backends // Reset all assignments and allocators - must be called before changing the node backends
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
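// Hedged end-to-end sketch (editor's addition) of the asynchronous scheduler path added above,
// assuming the backends array and the graph are built elsewhere:
//
//   enum ggml_status compute_async_example(ggml_backend_t * backends, int n_backends,
//                                          struct ggml_cgraph * graph) {
//       // the final argument enables parallel (multi-copy) scheduling backed by per-backend events
//       ggml_backend_sched_t sched =
//           ggml_backend_sched_new(backends, NULL, n_backends, GGML_DEFAULT_GRAPH_SIZE, true);
//       enum ggml_status status = ggml_backend_sched_graph_compute_async(sched, graph);
//       ggml_backend_sched_synchronize(sched); // wait for all backends before reading outputs
//       ggml_backend_sched_free(sched);
//       return status;
//   }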
View File
@ -72,6 +72,7 @@
#define cudaEventCreateWithFlags hipEventCreateWithFlags #define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming #define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord #define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t #define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy #define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree #define cudaFree hipFree
@ -81,6 +82,7 @@
#define cudaGetDeviceProperties hipGetDeviceProperties #define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString #define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError #define cudaGetLastError hipGetLastError
#define cudaLaunchHostFunc hipLaunchHostFunc
#ifdef GGML_HIP_UMA #ifdef GGML_HIP_UMA
#define cudaMalloc hipMallocManaged #define cudaMalloc hipMallocManaged
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size) #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
@ -104,6 +106,7 @@
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamFireAndForget hipStreamFireAndForget #define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize #define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaStream_t hipStream_t #define cudaStream_t hipStream_t
@ -10641,8 +10644,20 @@ GGML_CALL void ggml_cuda_get_device_description(int device, char * description,
#define UNUSED GGML_UNUSED #define UNUSED GGML_UNUSED
struct ggml_backend_cuda_context { struct ggml_backend_cuda_context {
explicit ggml_backend_cuda_context(int device) :
device(device),
name(GGML_CUDA_NAME + std::to_string(device)) {
}
~ggml_backend_cuda_context() {
if (copy_event != nullptr) {
CUDA_CHECK(cudaEventDestroy(copy_event));
}
}
int device; int device;
std::string name; std::string name;
cudaEvent_t copy_event = nullptr;
}; };
// cuda buffer // cuda buffer
@ -10732,9 +10747,8 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device); ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
CUDA_CHECK(cudaDeviceSynchronize());
} }
GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@ -10743,26 +10757,25 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device); ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
CUDA_CHECK(cudaDeviceSynchronize());
} }
GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
if (ggml_backend_buffer_is_cuda(src->buffer)) { if (ggml_backend_buffer_is_cuda(src->buffer)) {
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context; ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
if (src_ctx->device == dst_ctx->device) {
ggml_cuda_set_device(src_ctx->device); CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
CUDA_CHECK(cudaDeviceSynchronize()); } else {
ggml_cuda_set_device(dst_ctx->device); CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
CUDA_CHECK(cudaDeviceSynchronize()); }
CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice)); CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
CUDA_CHECK(cudaDeviceSynchronize());
return true; return true;
} }
return false; return false;
UNUSED(buffer);
} }
GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@ -11007,7 +11020,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf
} }
const char * buf_host = (const char *)data + offset_split; const char * buf_host = (const char *)data + offset_split;
CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
}
for (int id = 0; id < g_device_count; ++id) {
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
} }
} }
@ -11041,7 +11058,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf
} }
char * buf_host = (char *)data + offset_split; char * buf_host = (char *)data + offset_split;
CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
}
for (int id = 0; id < g_device_count; ++id) {
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
} }
} }
@ -11220,6 +11241,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
return &ggml_backend_cuda_buffer_type_host; return &ggml_backend_cuda_buffer_type_host;
} }
//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
// return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
//}
// backend // backend
GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) { GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
@ -11243,8 +11268,9 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer
GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0])); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
@ -11252,22 +11278,61 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0])); CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
} }
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) { ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0])); ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
return true;
if (!ggml_backend_buffer_is_cuda(src->buffer)) {
return false;
} }
return false; if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
return false;
}
// device -> device
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
if (backend_src != backend_dst) {
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
if (!cuda_ctx_src->copy_event) {
ggml_cuda_set_device(cuda_ctx_src->device);
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
}
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
} else {
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_src->device][0]));
}
// record event on src stream
CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, g_cudaStreams[cuda_ctx_src->device][0]));
// wait on dst stream for the copy to complete
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], cuda_ctx_src->copy_event, 0));
} else {
// src and dst are on the same backend
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
}
return true;
} }
GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
@ -11444,6 +11509,52 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
UNUSED(backend); UNUSED(backend);
} }
static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_device(cuda_ctx->device);
cudaEvent_t event;
CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
return new ggml_backend_event {
/* .backend = */ backend,
/* .context = */ event,
};
}
static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
delete event;
}
static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, g_cudaStreams[cuda_ctx->device][0]));
}
static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
if (ggml_backend_is_cuda(event->backend)) {
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
} else {
// untested
auto wait_fn = [](void * user_data) {
ggml_backend_event_t event = (ggml_backend_event_t)user_data;
ggml_backend_event_synchronize(event);
};
CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
}
}
static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
}
static ggml_backend_i ggml_backend_cuda_interface = { static ggml_backend_i ggml_backend_cuda_interface = {
/* .get_name = */ ggml_backend_cuda_name, /* .get_name = */ ggml_backend_cuda_name,
/* .free = */ ggml_backend_cuda_free, /* .free = */ ggml_backend_cuda_free,
@ -11457,6 +11568,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .supports_op = */ ggml_backend_cuda_supports_op, /* .supports_op = */ ggml_backend_cuda_supports_op,
/* .event_new = */ ggml_backend_cuda_event_new,
/* .event_free = */ ggml_backend_cuda_event_free,
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
/* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
}; };
static ggml_guid_t ggml_backend_cuda_guid() { static ggml_guid_t ggml_backend_cuda_guid() {
@ -11475,10 +11591,11 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
// not strictly necessary, but it may reduce the overhead of the first graph_compute // not strictly necessary, but it may reduce the overhead of the first graph_compute
ggml_cuda_set_main_device(device); ggml_cuda_set_main_device(device);
ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context { ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
/* .device = */ device, if (ctx == nullptr) {
/* .name = */ GGML_CUDA_NAME + std::to_string(device), fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
}; return nullptr;
}
ggml_backend_t cuda_backend = new ggml_backend { ggml_backend_t cuda_backend = new ggml_backend {
/* .guid = */ ggml_backend_cuda_guid(), /* .guid = */ ggml_backend_cuda_guid(),
View File
@ -1951,6 +1951,11 @@ static struct ggml_backend_i kompute_backend_i = {
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_kompute_graph_compute, /* .graph_compute = */ ggml_backend_kompute_graph_compute,
/* .supports_op = */ ggml_backend_kompute_supports_op, /* .supports_op = */ ggml_backend_kompute_supports_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .event_synchronize = */ NULL,
}; };
static ggml_guid_t ggml_backend_kompute_guid() { static ggml_guid_t ggml_backend_kompute_guid() {
View File
@ -280,6 +280,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
id<MTLLibrary> metal_library; id<MTLLibrary> metal_library;
// load library // load library
//
// - first check if the library is embedded
// - then check if the library is in the bundle
// - if not found, load the source and compile it
// - if that fails, return NULL
{ {
NSBundle * bundle = nil; NSBundle * bundle = nil;
#ifdef SWIFT_PACKAGE #ifdef SWIFT_PACKAGE
@ -287,12 +292,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
#else #else
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
#endif #endif
NSError * error = nil; NSError * error = nil;
NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
if (libPath != nil) { #if GGML_METAL_EMBED_LIBRARY
const bool try_metallib = false;
#else
const bool try_metallib = true;
#endif
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
if (try_metallib && path_lib != nil) {
// pre-compiled library found // pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:libPath]; NSURL * libURL = [NSURL fileURLWithPath:path_lib];
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]); GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
metal_library = [ctx->device newLibraryWithURL:libURL error:&error]; metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
if (error) { if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
@ -305,31 +319,34 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
extern const char ggml_metallib_start[]; extern const char ggml_metallib_start[];
extern const char ggml_metallib_end[]; extern const char ggml_metallib_end[];
NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding]; NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
#else #else
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
NSString * sourcePath; NSString * path_source;
NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"]; NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? [ggmlMetalPathResources UTF8String] : "nil"); GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
if (ggmlMetalPathResources) { if (path_resource) {
sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"]; path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
} else { } else {
sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
} }
if (sourcePath == nil) {
if (path_source == nil) {
GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
sourcePath = @"ggml-metal.metal"; path_source = @"ggml-metal.metal";
} }
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error]; GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
if (error) { if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL; return NULL;
} }
#endif #endif // GGML_METAL_EMBED_LIBRARY
@autoreleasepool { @autoreleasepool {
// dictionary of preprocessor macros // dictionary of preprocessor macros
@ -2820,6 +2837,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_metal_graph_compute, /* .graph_compute = */ ggml_backend_metal_graph_compute,
/* .supports_op = */ ggml_backend_metal_supports_op, /* .supports_op = */ ggml_backend_metal_supports_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .event_synchronize = */ NULL,
}; };
void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) { void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
View File
@ -4,9 +4,6 @@
#include <metal_stdlib> #include <metal_stdlib>
#define GGML_COMMON_IMPL_METAL
#include "ggml-common.h"
using namespace metal; using namespace metal;
#define MAX(x, y) ((x) > (y) ? (x) : (y)) #define MAX(x, y) ((x) > (y) ? (x) : (y))
View File
@ -202,24 +202,29 @@ namespace dpct
// Version string has the following format: // Version string has the following format:
// a. OpenCL<space><major.minor><space><vendor-specific-information> // a. OpenCL<space><major.minor><space><vendor-specific-information>
// b. <major.minor> // b. <major.minor>
// c. <AmdGcnArchName> e.g. gfx1030
std::string ver; std::string ver;
ver = dev.get_info<sycl::info::device::version>(); ver = dev.get_info<sycl::info::device::version>();
std::string::size_type i = 0; std::string::size_type i = 0;
while (i < ver.size()) while (i < ver.size()) {
{ if (isdigit(ver[i]))
if (isdigit(ver[i])) break;
break; i++;
i++;
} }
major = std::stoi(&(ver[i])); major = std::stoi(&(ver[i]));
while (i < ver.size()) while (i < ver.size()) {
{ if (ver[i] == '.')
if (ver[i] == '.') break;
break; i++;
i++; }
if (i < ver.size()) {
// a. and b.
i++;
minor = std::stoi(&(ver[i]));
} else {
// c.
minor = 0;
} }
i++;
minor = std::stoi(&(ver[i]));
} }
template <typename tag, typename T> template <typename tag, typename T>
@ -17244,13 +17249,18 @@ static ggml_backend_i ggml_backend_sycl_interface = {
/* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type, /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
/* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async, /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async, /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
/* .cpy_tensor_async = */ ggml_backend_sycl_cpy_tensor_async, /* .cpy_tensor_async = */ NULL, //ggml_backend_sycl_cpy_tensor_async, // TODO: update for the new interface
/* .synchronize = */ ggml_backend_sycl_synchronize, /* .synchronize = */ ggml_backend_sycl_synchronize,
/* .graph_plan_create = */ NULL, /* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL, /* .graph_plan_free = */ NULL,
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .graph_compute = */ ggml_backend_sycl_graph_compute,
/* .supports_op = */ ggml_backend_sycl_supports_op, /* .supports_op = */ ggml_backend_sycl_supports_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .event_synchronize = */ NULL,
}; };
static ggml_guid_t ggml_backend_sycl_guid() { static ggml_guid_t ggml_backend_sycl_guid() {
View File
@ -5693,6 +5693,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_vk_graph_compute, /* .graph_compute = */ ggml_backend_vk_graph_compute,
/* .supports_op = */ ggml_backend_vk_supports_op, /* .supports_op = */ ggml_backend_vk_supports_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .event_synchronize = */ NULL,
}; };
static ggml_guid_t ggml_backend_vk_guid() { static ggml_guid_t ggml_backend_vk_guid() {
ggml.c
View File
@ -11560,8 +11560,6 @@ static void ggml_compute_forward_get_rows_q(
const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1]; const struct ggml_tensor * src1 = dst->src[1];
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return; return;
} }
@ -11569,7 +11567,7 @@ static void ggml_compute_forward_get_rows_q(
GGML_TENSOR_BINARY_OP_LOCALS GGML_TENSOR_BINARY_OP_LOCALS
const int64_t nc = ne00; const int64_t nc = ne00;
const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr); const int64_t nr = ggml_nelements(src1);
const enum ggml_type type = src0->type; const enum ggml_type type = src0->type;
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
@ -11579,17 +11577,25 @@ static void ggml_compute_forward_get_rows_q(
assert(nb00 == ggml_type_size(type)); assert(nb00 == ggml_type_size(type));
assert(ggml_nrows(dst) == nr); assert(ggml_nrows(dst) == nr);
// TODO: multi-thread const int ith = params->ith;
for (int64_t i12 = 0; i12 < ne12; ++i12) { const int nth = params->nth;
for (int64_t i11 = 0; i11 < ne11; ++i11) {
for (int64_t i10 = 0; i10 < ne10; ++i10) {
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
dequantize_row_q( // rows per thread
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), const int dr = (nr + nth - 1)/nth;
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
} // row range for this thread
} const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int64_t i = ir0; i < ir1; ++i) {
const int64_t i12 = i/(ne11*ne10);
const int64_t i11 = (i - i12*ne11*ne10)/ne10;
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
dequantize_row_q(
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
} }
} }
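// Editor's note (not part of the patch): the get_rows variants in this file now share the same
// partitioning idiom: each of nth threads owns a contiguous block of dr rows. A stand-alone
// sketch of that idiom, assuming nr >= 0 and nth > 0:
//
//   static void get_rows_row_range(int64_t nr, int ith, int nth, int64_t * ir0, int64_t * ir1) {
//       const int64_t dr = (nr + nth - 1)/nth; // ceil(nr/nth) rows per thread
//       *ir0 = dr*ith;                         // first row owned by thread ith
//       *ir1 = MIN(*ir0 + dr, nr);             // one past the last row, clamped to nr
//   }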
@ -11600,8 +11606,6 @@ static void ggml_compute_forward_get_rows_f16(
const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1]; const struct ggml_tensor * src1 = dst->src[1];
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return; return;
} }
@ -11609,24 +11613,32 @@ static void ggml_compute_forward_get_rows_f16(
GGML_TENSOR_BINARY_OP_LOCALS GGML_TENSOR_BINARY_OP_LOCALS
const int64_t nc = ne00; const int64_t nc = ne00;
const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr); const int64_t nr = ggml_nelements(src1);
assert(ne0 == nc); assert(ne0 == nc);
assert(ne02 == ne11); assert(ne02 == ne11);
assert(nb00 == sizeof(ggml_fp16_t)); assert(nb00 == sizeof(ggml_fp16_t));
assert(ggml_nrows(dst) == nr); assert(ggml_nrows(dst) == nr);
// TODO: multi-thread const int ith = params->ith;
for (int64_t i12 = 0; i12 < ne12; ++i12) { const int nth = params->nth;
for (int64_t i11 = 0; i11 < ne11; ++i11) {
for (int64_t i10 = 0; i10 < ne10; ++i10) {
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
ggml_fp16_to_fp32_row( // rows per thread
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), const int dr = (nr + nth - 1)/nth;
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
} // row range for this thread
} const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int64_t i = ir0; i < ir1; ++i) {
const int64_t i12 = i/(ne11*ne10);
const int64_t i11 = (i - i12*ne11*ne10)/ne10;
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
ggml_fp16_to_fp32_row(
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
} }
} }
@ -11637,8 +11649,6 @@ static void ggml_compute_forward_get_rows_f32(
const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1]; const struct ggml_tensor * src1 = dst->src[1];
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return; return;
} }
@ -11646,24 +11656,32 @@ static void ggml_compute_forward_get_rows_f32(
GGML_TENSOR_BINARY_OP_LOCALS GGML_TENSOR_BINARY_OP_LOCALS
const int64_t nc = ne00; const int64_t nc = ne00;
const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr); const int64_t nr = ggml_nelements(src1);
assert(ne0 == nc); assert(ne0 == nc);
assert(ne02 == ne11); assert(ne02 == ne11);
assert(nb00 == sizeof(float)); assert(nb00 == sizeof(float));
assert(ggml_nrows(dst) == nr); assert(ggml_nrows(dst) == nr);
// TODO: multi-thread const int ith = params->ith;
for (int64_t i12 = 0; i12 < ne12; ++i12) { const int nth = params->nth;
for (int64_t i11 = 0; i11 < ne11; ++i11) {
for (int64_t i10 = 0; i10 < ne10; ++i10) {
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
ggml_vec_cpy_f32(nc, // rows per thread
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), const int dr = (nr + nth - 1)/nth;
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
} // row range for this thread
} const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int64_t i = ir0; i < ir1; ++i) {
const int64_t i12 = i/(ne11*ne10);
const int64_t i11 = (i - i12*ne11*ne10)/ne10;
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
ggml_vec_cpy_f32(nc,
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
} }
} }
@ -17796,7 +17814,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
node->perf_time_us += time_us_cur; node->perf_time_us += time_us_cur;
} }
static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
int n_tasks = 0; int n_tasks = 0;
switch (node->op) { switch (node->op) {
@ -17877,6 +17895,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
{ {
n_tasks = n_threads; n_tasks = n_threads;
} break; } break;
case GGML_OP_GET_ROWS:
{
// FIXME: the cost of launching additional threads decreases performance with GPU offloading
//n_tasks = MIN(n_threads, ggml_nelements(node->src[1]));
n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1]));
} break;
case GGML_OP_SCALE: case GGML_OP_SCALE:
case GGML_OP_SET: case GGML_OP_SET:
case GGML_OP_CONT: case GGML_OP_CONT:
@ -17884,7 +17908,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_VIEW: case GGML_OP_VIEW:
case GGML_OP_PERMUTE: case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE: case GGML_OP_TRANSPOSE:
case GGML_OP_GET_ROWS:
case GGML_OP_GET_ROWS_BACK: case GGML_OP_GET_ROWS_BACK:
case GGML_OP_DIAG: case GGML_OP_DIAG:
{ {
@ -18102,7 +18125,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
/* FINALIZE */ /* FINALIZE */
struct ggml_tensor * node = cgraph->nodes[node_n]; struct ggml_tensor * node = cgraph->nodes[node_n];
if (GGML_OP_HAS_FINALIZE[node->op]) { if (GGML_OP_HAS_FINALIZE[node->op]) {
params.nth = ggml_get_n_tasks(node, n_threads); params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
ggml_compute_forward(&params, node); ggml_compute_forward(&params, node);
} }
ggml_graph_compute_perf_stats_node(node, state->shared); ggml_graph_compute_perf_stats_node(node, state->shared);
@ -18112,7 +18135,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
while (++node_n < cgraph->n_nodes) { while (++node_n < cgraph->n_nodes) {
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
struct ggml_tensor * node = cgraph->nodes[node_n]; struct ggml_tensor * node = cgraph->nodes[node_n];
const int n_tasks = ggml_get_n_tasks(node, n_threads); const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
state->shared->perf_node_start_cycles = ggml_perf_cycles(); state->shared->perf_node_start_cycles = ggml_perf_cycles();
state->shared->perf_node_start_time_us = ggml_perf_time_us(); state->shared->perf_node_start_time_us = ggml_perf_time_us();
@ -18160,7 +18183,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
/* INIT & COMPUTE */ /* INIT & COMPUTE */
struct ggml_tensor * node = cgraph->nodes[node_n]; struct ggml_tensor * node = cgraph->nodes[node_n];
const int n_tasks = ggml_get_n_tasks(node, n_threads); const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
struct ggml_compute_params params = { struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_TYPE_INIT, /*.type =*/ GGML_TASK_TYPE_INIT,
@ -18225,7 +18248,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
for (int i = 0; i < cgraph->n_nodes; i++) { for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i]; struct ggml_tensor * node = cgraph->nodes[i];
const int n_tasks = ggml_get_n_tasks(node, n_threads); const int n_tasks = ggml_get_n_tasks(node, n_threads, 1);
max_tasks = MAX(max_tasks, n_tasks); max_tasks = MAX(max_tasks, n_tasks);
ggml.h
View File
@ -337,24 +337,24 @@ extern "C" {
struct ggml_object; struct ggml_object;
struct ggml_context; struct ggml_context;
// NOTE: always add types at the end of the enum to keep backward compatibility
enum ggml_type { enum ggml_type {
GGML_TYPE_F32 = 0, GGML_TYPE_F32 = 0,
GGML_TYPE_F16 = 1, GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2, GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3, GGML_TYPE_Q4_1 = 3,
// GGML_TYPE_Q4_2 = 4, support has been removed // GGML_TYPE_Q4_2 = 4, support has been removed
// GGML_TYPE_Q4_3 (5) support has been removed // GGML_TYPE_Q4_3 = 5, support has been removed
GGML_TYPE_Q5_0 = 6, GGML_TYPE_Q5_0 = 6,
GGML_TYPE_Q5_1 = 7, GGML_TYPE_Q5_1 = 7,
GGML_TYPE_Q8_0 = 8, GGML_TYPE_Q8_0 = 8,
GGML_TYPE_Q8_1 = 9, GGML_TYPE_Q8_1 = 9,
// k-quantizations GGML_TYPE_Q2_K = 10,
GGML_TYPE_Q2_K = 10, GGML_TYPE_Q3_K = 11,
GGML_TYPE_Q3_K = 11, GGML_TYPE_Q4_K = 12,
GGML_TYPE_Q4_K = 12, GGML_TYPE_Q5_K = 13,
GGML_TYPE_Q5_K = 13, GGML_TYPE_Q6_K = 14,
GGML_TYPE_Q6_K = 14, GGML_TYPE_Q8_K = 15,
GGML_TYPE_Q8_K = 15,
GGML_TYPE_IQ2_XXS = 16, GGML_TYPE_IQ2_XXS = 16,
GGML_TYPE_IQ2_XS = 17, GGML_TYPE_IQ2_XS = 17,
GGML_TYPE_IQ3_XXS = 18, GGML_TYPE_IQ3_XXS = 18,
@ -363,9 +363,9 @@ extern "C" {
GGML_TYPE_IQ3_S = 21, GGML_TYPE_IQ3_S = 21,
GGML_TYPE_IQ2_S = 22, GGML_TYPE_IQ2_S = 22,
GGML_TYPE_IQ4_XS = 23, GGML_TYPE_IQ4_XS = 23,
GGML_TYPE_I8, GGML_TYPE_I8 = 24,
GGML_TYPE_I16, GGML_TYPE_I16 = 25,
GGML_TYPE_I32, GGML_TYPE_I32 = 26,
GGML_TYPE_COUNT, GGML_TYPE_COUNT,
}; };
@@ -383,20 +383,20 @@ extern "C" {
     // model file types
     enum ggml_ftype {
         GGML_FTYPE_UNKNOWN = -1,
         GGML_FTYPE_ALL_F32 = 0,
         GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
         GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
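The explicit initializers pin the integer ids that end up serialized in GGUF tensor metadata. A small compile-time check, purely illustrative and not part of the patch, shows the guarantee this buys:

    #include "ggml.h"

    // Illustrative compile-time pins (an assumption of this note, not code from
    // the patch): if the enum were ever reordered, the build breaks instead of
    // the type ids written into new GGUF files silently changing.
    _Static_assert(GGML_TYPE_I8  == 24, "GGML_TYPE_I8 id is serialized in GGUF files");
    _Static_assert(GGML_TYPE_I16 == 25, "GGML_TYPE_I16 id is serialized in GGUF files");
    _Static_assert(GGML_TYPE_I32 == 26, "GGML_TYPE_I32 id is serialized in GGUF files");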


@@ -661,6 +661,9 @@ class GGMLQuantizationType(IntEnum):
     IQ3_S = 21
     IQ2_S = 22
     IQ4_XS = 23
+    I8 = 24
+    I16 = 25
+    I32 = 26


 class GGUFEndian(IntEnum):
@@ -727,6 +730,9 @@ GGML_QUANT_SIZES = {
     GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
     GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16),
     GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64),
+    GGMLQuantizationType.I8: (1, 1),
+    GGMLQuantizationType.I16: (1, 2),
+    GGMLQuantizationType.I32: (1, 4),
 }
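Each new GGML_QUANT_SIZES entry is a (block size, bytes per block) pair, and for plain integer types the block size is 1. A quick sketch against the C API (my own illustration; the printed values are expected to match the (1, 1), (1, 2), (1, 4) entries above):

    #include <stdio.h>
    #include "ggml.h"

    // Hedged sketch: print the C-side block size and bytes-per-block for the
    // newly mapped integer types, mirroring the Python table above.
    int main(void) {
        const enum ggml_type types[] = { GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32 };
        for (int i = 0; i < 3; ++i) {
            printf("%s: block=%d bytes=%zu\n",
                   ggml_type_name(types[i]),
                   ggml_blck_size(types[i]),
                   ggml_type_size(types[i]));
        }
        return 0;
    }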


@@ -248,6 +248,15 @@ class GGUFReader:
         elif ggml_type == GGMLQuantizationType.F16:
             item_count = n_elems
             item_type = np.float16
+        elif ggml_type == GGMLQuantizationType.I8:
+            item_count = n_elems
+            item_type = np.int8
+        elif ggml_type == GGMLQuantizationType.I16:
+            item_count = n_elems
+            item_type = np.int16
+        elif ggml_type == GGMLQuantizationType.I32:
+            item_count = n_elems
+            item_type = np.int32
         else:
             item_count = n_bytes
             item_type = np.uint8


@@ -196,9 +196,6 @@ class GGUFWriter:
         if self.state is not WriterState.EMPTY:
             raise ValueError(f'Expected output file to be empty, got {self.state}')
-        if raw_dtype is None and tensor_dtype not in (np.float32, np.float16):
-            raise ValueError("Only F32 and F16 tensors are supported for now")
         encoded_name = name.encode("utf8")
         self.ti_data += self._pack("Q", len(encoded_name))
         self.ti_data += encoded_name
@@ -207,7 +204,18 @@ class GGUFWriter:
         for i in range(n_dims):
             self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
-            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+            if tensor_dtype == np.float32:
+                dtype = GGMLQuantizationType.F32
+            elif tensor_dtype == np.float16:
+                dtype = GGMLQuantizationType.F16
+            elif tensor_dtype == np.int8:
+                dtype = GGMLQuantizationType.I8
+            elif tensor_dtype == np.int16:
+                dtype = GGMLQuantizationType.I16
+            elif tensor_dtype == np.int32:
+                dtype = GGMLQuantizationType.I32
+            else:
+                raise ValueError("Only F32, F16, I8, I16, I32 tensors are supported for now")
         else:
             dtype = raw_dtype
         self.ti_data += self._pack("I", dtype)

llama.cpp

File diff suppressed because it is too large.


@@ -234,7 +234,8 @@ extern "C" {
     struct llama_context_params {
         uint32_t seed; // RNG seed, -1 for random
         uint32_t n_ctx; // text context, 0 = from model
-        uint32_t n_batch; // prompt processing maximum batch size
+        uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
+        uint32_t n_ubatch; // physical maximum batch size
         uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_threads; // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
@@ -377,6 +378,7 @@ extern "C" {
     LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
@@ -664,6 +666,11 @@ extern "C" {
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
+    // Wait until all computations are finished
+    // This is automatically done when using one of the functions below to obtain the computation results
+    // and is not necessary to call it explicitly in most cases
+    LLAMA_API void llama_synchronize(struct llama_context * ctx);
     // Token logits obtained from the last call to llama_decode()
     // The logits for the last token are stored in the last row
     // Logits for which llama_batch.logits[i] == 0 are undefined
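The new fields separate the logical batch limit from the physical one: n_batch bounds what a single llama_decode() call may submit, while n_ubatch bounds the micro-batches the backend actually runs. A hedged sketch of configuring a context and of where llama_synchronize() fits (the concrete sizes are illustrative, not recommendations):

    #include "llama.h"

    // Hedged sketch: n_batch is the logical cap per llama_decode() call,
    // n_ubatch the physical micro-batch size the backend processes at once.
    static struct llama_context * make_context(struct llama_model * model) {
        struct llama_context_params cparams = llama_context_default_params();

        cparams.n_ctx    = 4096; // illustrative values
        cparams.n_batch  = 2048; // tokens accepted by one llama_decode() call
        cparams.n_ubatch = 512;  // tokens per physical micro-batch

        struct llama_context * ctx = llama_new_context_with_model(model, cparams);

        // After llama_decode(ctx, batch), an explicit wait is only needed for
        // things like timing or bookkeeping:
        // llama_synchronize(ctx);

        return ctx;
    }

Per the header comment, the synchronize call is optional in the common path, since result accessors such as llama_get_logits() wait for completion themselves.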


@@ -2222,8 +2222,8 @@ static void usage(char ** argv) {
 int main(int argc, char ** argv) {
     test_mode mode = MODE_TEST;
-    const char * op_name = NULL;
-    const char * backend = NULL;
+    const char * op_name_filter = NULL;
+    const char * backend_filter = NULL;
     for (int i = 1; i < argc; i++) {
         if (strcmp(argv[i], "test") == 0) {
@@ -2232,14 +2232,14 @@ int main(int argc, char ** argv) {
             mode = MODE_PERF;
         } else if (strcmp(argv[i], "-o") == 0) {
             if (i + 1 < argc) {
-                op_name = argv[++i];
+                op_name_filter = argv[++i];
             } else {
                 usage(argv);
                 return 1;
             }
         } else if (strcmp(argv[i], "-b") == 0) {
             if (i + 1 < argc) {
-                backend = argv[++i];
+                backend_filter = argv[++i];
             } else {
                 usage(argv);
                 return 1;
@@ -2258,7 +2258,7 @@ int main(int argc, char ** argv) {
     for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
         printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));
-        if (backend != NULL && strcmp(backend, ggml_backend_reg_get_name(i)) != 0) {
+        if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_reg_get_name(i)) != 0) {
            printf(" Skipping\n");
            n_ok++;
            continue;
@@ -2266,9 +2266,17 @@ int main(int argc, char ** argv) {
         ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
         GGML_ASSERT(backend != NULL);
+        if (backend_filter == NULL && ggml_backend_is_cpu(backend)) {
+            printf(" Skipping CPU backend\n");
+            ggml_backend_free(backend);
+            n_ok++;
+            continue;
+        }
         printf(" Backend name: %s\n", ggml_backend_name(backend));
-        bool ok = test_backend(backend, mode, op_name);
+        bool ok = test_backend(backend, mode, op_name_filter);
         printf(" Backend %s: ", ggml_backend_name(backend));
         if (ok) {