Merge branch 'master' into gg/flash-attn

Georgi Gerganov 2024-01-28 21:53:51 +02:00
commit 0ad44baf33
36 changed files with 85118 additions and 89 deletions


@ -143,6 +143,47 @@ jobs:
cd build
ctest -L main --verbose
ubuntu-22-cmake-sycl:
runs-on: ubuntu-22.04
continue-on-error: true
steps:
- uses: actions/checkout@v2
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake --build . --config Release -j $(nproc)
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124


@ -1,5 +1,6 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX)
include(CheckIncludeFileCXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@ -98,11 +99,14 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_VULKAN "llama: use Vulkan" OFF)
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@ -121,8 +125,12 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
#
# Compile flags
#
if (LLAMA_SYCL)
set(CMAKE_CXX_STANDARD 17)
else()
set(CMAKE_CXX_STANDARD 11)
endif()
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
@ -409,6 +417,22 @@ if (LLAMA_CLBLAST)
endif()
endif()
if (LLAMA_VULKAN)
find_package(Vulkan)
if (Vulkan_FOUND)
message(STATUS "Vulkan found")
add_library(ggml-vulkan STATIC ggml-vulkan.cpp ggml-vulkan.h)
target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
add_compile_definitions(GGML_USE_VULKAN)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-vulkan)
else()
message(WARNING "Vulkan not found")
endif()
endif()
if (LLAMA_HIPBLAS)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
@ -454,6 +478,32 @@ if (LLAMA_HIPBLAS)
endif()
endif()
if (LLAMA_SYCL)
if ( NOT DEFINED ENV{ONEAPI_ROOT})
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
endif()
#todo: AOT
find_package(IntelSYCL REQUIRED)
if (LLAMA_SYCL_F16)
add_compile_definitions(GGML_SYCL_F16)
endif()
add_compile_definitions(GGML_USE_SYCL)
add_compile_options(-I./) #include DPCT
add_compile_options(-I/${SYCL_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
set(GGML_SOURCES_SYCL ggml-sycl.cpp)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
endif()
function(get_flags CCID CCVER)
set(C_FLAGS "")
set(CXX_FLAGS "")
@ -479,10 +529,12 @@ function(get_flags CCID CCVER)
list(APPEND CXX_FLAGS -Wextra-semi)
endif()
elseif (CCID MATCHES "Intel")
if (NOT LLAMA_SYCL)
# enable max optimization level when using Intel compiler
set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
add_link_options(-fuse-ld=lld -static-intel)
endif()
endif()
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
@ -799,6 +851,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
)
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})


@ -448,6 +448,19 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_CLBLAST
ifdef LLAMA_VULKAN
MK_CPPFLAGS += -DGGML_USE_VULKAN
MK_LDFLAGS += -lvulkan
OBJS += ggml-vulkan.o
ifdef LLAMA_VULKAN_CHECK_RESULTS
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
endif
ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_VULKAN
ifdef LLAMA_HIPBLAS
ifeq ($(wildcard /opt/rocm),)


@ -63,7 +63,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
- AVX, AVX2 and AVX512 support for x86 architectures
- Mixed F16 / F32 precision
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
- CUDA, Metal, OpenCL, SYCL GPU backend support
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
@ -599,6 +599,15 @@ Building the program with BLAS support may lead to some performance improvements
You can get a list of platforms and devices from the `clinfo -l` command, etc.
- #### SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
The SYCL build of llama.cpp supports Intel GPUs (Data Center Max series, Flex series, Arc series, built-in GPUs and iGPUs).
For detailed info, please refer to [llama.cpp for SYCL](README_sycl.md).
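As a quick orientation, a minimal SYCL build (covered in detail in [llama.cpp for SYCL](README_sycl.md), and sketched here assuming oneAPI is installed in the default /opt/intel/oneapi location) looks roughly like this:

```bash
# enable the oneAPI compilers and runtime (icx/icpx, MKL, SYCL)
source /opt/intel/oneapi/setvars.sh

mkdir -p build && cd build
# FP32 build; add -DLLAMA_SYCL_F16=ON for the FP16 variant
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build . --config Release
```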
### Prepare Data & Run

```bash

README_sycl.md Normal file

@ -0,0 +1,252 @@
# llama.cpp for SYCL
- [Background](#background)
- [OS](#os)
- [Intel GPU](#intel-gpu)
- [Linux](#linux)
- [Environment Variable](#environment-variable)
- [Known Issue](#known-issue)
- [Todo](#todo)
## Background
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
Intel uses SYCL as the direct programming language to support CPUs, GPUs and FPGAs.
To avoid re-inventing the wheel, this code follows the other code paths in llama.cpp (such as OpenBLAS, cuBLAS, CLBlast). The code was migrated to SYCL with the open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)).
llama.cpp for SYCL is used to support Intel GPUs.
For Intel CPUs, we recommend using llama.cpp for x86 (built with Intel MKL).
## OS
|OS|Status|Verified|
|-|-|-|
|Linux|Support|Ubuntu 22.04|
|Windows|Ongoing| |
## Intel GPU
|Intel GPU| Status | Verified Model|
|-|-|-|
|Intel Data Center Max Series| Support| Max 1550|
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
## Linux
### Setup Environment
1. Install the Intel GPU driver.
a. Install the Intel GPU driver following the official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
Note: for iGPUs, install the client GPU driver.
b. Add the user to the video and render groups:
```
sudo usermod -aG render username
sudo usermod -aG video username
```
Note: re-login to enable it.
c. Check
```
sudo apt install clinfo
sudo clinfo -l
```
Output (example):
```
Platform #0: Intel(R) OpenCL Graphics
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
Platform #0: Intel(R) OpenCL HD Graphics
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
```
2. Install Intel® oneAPI Base toolkit.
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
We recommend installing to the default folder: **/opt/intel/oneapi**.
The following guide uses the default folder as an example. If you installed to a different folder, adjust the paths accordingly.
b. Check
```
source /opt/intel/oneapi/setvars.sh
sycl-ls
```
There should be one or more level-zero devices, such as **[ext_oneapi_level_zero:gpu:0]**.
Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
```
3. Build locally:
```
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
#for FP32
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#cmake --build . --config Release --target main
#build all binary
cmake --build . --config Release -v
```
or
```
./examples/sycl/build.sh
```
Note:
- By default, all binaries are built, which takes more time. To reduce build time, we recommend building only **example/main**.
### Run
1. Put the model file into the **models** folder
2. Enable oneAPI running environment
```
source /opt/intel/oneapi/setvars.sh
```
3. List the device IDs
Run without parameters:
```
./build/bin/ls-sycl-device
or
./build/bin/main
```
Check the IDs in the startup log, for example:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-Zero runtime, recommended|
|compute capability 3.0|OpenCL runtime, slower than Level-Zero in most cases|
4. Set the device ID and execute llama.cpp
Set device ID = 0 with **GGML_SYCL_DEVICE=0**:
```
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
or run by script:
```
./examples/sycl/run_llama2.sh
```
Note:
- By default, mmap is used to read the model file. In some cases this leads to a hang; use the **--no-mmap** parameter to disable mmap() and work around the issue, as shown below.
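For example, the run command from the previous step with mmap disabled (a sketch assuming the same model file and device ID):

```
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 --no-mmap
```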
5. Check the device ID in the output
For example:
```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```
## Environment Variable
#### Build
|Name|Value|Function|
|-|-|-|
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, do not set it.|
|CMAKE_C_COMPILER|icx|Use the icx compiler for the SYCL code path|
|CMAKE_CXX_COMPILER|icpx|Use the icpx compiler for the SYCL code path|
#### Running
|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device ID to use. Check the device IDs in the default startup output|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable debug logging via the GGML_SYCL_DEBUG macro|
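Putting the build-time and run-time variables together, a typical sequence looks like the sketch below (assuming the default oneAPI install path and a model under **models**; the prompt and token count are placeholders):

```
source /opt/intel/oneapi/setvars.sh
# build-time variables are passed to CMake
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# run-time variables are set in the environment
GGML_SYCL_DEVICE=0 GGML_SYCL_DEBUG=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Hello" -n 32 -e -ngl 33
```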
## Known Issue
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
The oneAPI runtime environment is not enabled.
Install the oneAPI base toolkit and enable it with: `source /opt/intel/oneapi/setvars.sh`.
- Hang during startup
llama.cpp uses mmap as the default way to read the model file and copy it to the GPU. On some systems the memcpy misbehaves and blocks.
Solution: add **--no-mmap**.
## Todo
- Support building on Windows.
- Support multiple cards.


@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with CUDA support
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with SYCL support
source /opt/intel/oneapi/setvars.sh
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```


@ -10,6 +10,9 @@
# # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
@ -40,6 +43,14 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
if [ -z ${ONEAPI_ROOT} ]; then
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
exit 1
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
fi
## helpers
# download a file if it does not exist or if it is outdated


@ -42,6 +42,10 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
#define GGML_USE_CUBLAS_SYCL
#endif
int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
@ -599,9 +603,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.main_gpu = std::stoi(argv[i]);
#ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--split-mode" || arg == "-sm") {
if (++i >= argc) {
invalid_param = true;
@ -618,9 +622,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
#ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
@ -643,9 +648,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.tensor_split[i] = 0.0f;
}
}
#ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
#endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--numa") {
@ -1007,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
#endif #endif // LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false"); printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
printf(" -gan N, --grp-attn-n N\n"); printf(" -gan N, --grp-attn-n N\n");
@ -1514,7 +1519,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");


@ -23,6 +23,9 @@ else()
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
if (LLAMA_SYCL)
add_subdirectory(sycl)
endif()
add_subdirectory(main)
add_subdirectory(tokenize)
add_subdirectory(parallel)


@ -562,6 +562,7 @@ struct test {
static const int build_number;
static const bool cuda;
static const bool opencl;
static const bool vulkan;
static const bool metal;
static const bool gpu_blas;
static const bool blas;
@ -643,6 +644,9 @@ struct test {
if (opencl) {
return "OpenCL";
}
if (vulkan) {
return "Vulkan";
}
if (metal) {
return "Metal";
}
@ -658,7 +662,7 @@ struct test {
static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = {
"build_commit", "build_number",
"cuda", "opencl", "vulkan", "metal", "gpu_blas", "blas",
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_threads", "type_k", "type_v",
@ -682,7 +686,7 @@ struct test {
field == "avg_ns" || field == "stddev_ns") {
return INT;
}
if (field == "cuda" || field == "opencl" || field == "vulkan"|| field == "metal" || field == "gpu_blas" || field == "blas" ||
field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
return BOOL;
}
@ -710,7 +714,7 @@ struct test {
}
std::vector<std::string> values = {
build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
@ -738,6 +742,7 @@ const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cublas();
const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas();


@ -2099,7 +2099,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
std::string arg_next = argv[i];
// split string by , and /
@ -2125,7 +2125,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
else if (arg == "--no-mul-mat-q" || arg == "-nommq")
{
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
params.mul_mat_q = false;
#else
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
@ -2138,7 +2138,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
params.main_gpu = std::stoi(argv[i]);
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});


@ -0,0 +1,9 @@
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
set(TARGET ls-sycl-device)
add_executable(${TARGET} ls-sycl-device.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

examples/sycl/README.md Normal file

@ -0,0 +1,47 @@
# llama.cpp/example/sycl
This example program provides the tools for llama.cpp for SYCL on Intel GPUs.
## Tool
|Tool Name| Function|Status|
|-|-|-|
|ls-sycl-device| List all SYCL devices with IDs, compute capability, max work group size, etc.|Support|
### ls-sycl-device
List all SYCL devices with IDs, compute capability, max work group size, etc.
1. Build llama.cpp for SYCL for all targets.
2. Enable oneAPI running environment
```
source /opt/intel/oneapi/setvars.sh
```
3. Execute
```
./build/bin/ls-sycl-device
```
Check the IDs in the startup log, for example:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-Zero runtime, recommended|
|compute capability 3.0|OpenCL runtime, slower than Level-Zero in most cases|

examples/sycl/build.sh Executable file

@ -0,0 +1,20 @@
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
#for FP32
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#cmake --build . --config Release --target main
#build all binary
cmake --build . --config Release -v


@ -0,0 +1,11 @@
/*MIT license
Copyright (C) 2024 Intel Corporation
SPDX-License-Identifier: MIT
*/
#include "ggml-sycl.h"
int main(int argc, char ** argv) {
ggml_backend_sycl_print_sycl_devices();
return 0;
}

examples/sycl/run-llama2.sh Executable file

@ -0,0 +1,19 @@
#!/bin/bash
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
source /opt/intel/oneapi/setvars.sh
if [ $# -gt 0 ]; then
export GGML_SYCL_DEVICE=$1
else
export GGML_SYCL_DEVICE=0
fi
echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
#export GGML_SYCL_DEBUG=1
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0

flake.lock generated

@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1706191920,
"narHash": "sha256-eLihrZAPZX0R6RyM5fYAWeKVNuQPYjAkCUBr+JNvtdE=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "ae5c332cbb5827f6b1f02572496b141021de335f",
"type": "github"
},
"original": {


@ -778,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
}
// utils
static bool alloc_tensor_range(struct ggml_context * ctx,
struct ggml_tensor * first, struct ggml_tensor * last,
ggml_backend_buffer_type_t buft, size_t size,
ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
if (buffer == NULL) {
#ifndef NDEBUG
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif
for (size_t i = 0; i < *n_buffers; i++) {
ggml_backend_buffer_free(*buffers[i]);
}
free(buffers);
return false;
}
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
if (t->view_src == NULL) {
ggml_tallocr_alloc(tallocr, t);
@ -826,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
ggml_tallocr_free(tallocr);
*buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
(*buffers)[(*n_buffers)++] = buffer;
return true;
}
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
size_t alignment = ggml_backend_buft_get_alignment(buft);
size_t max_size = ggml_backend_buft_get_max_size(buft);
ggml_backend_buffer_t * buffers = NULL;
size_t n_buffers = 0;
size_t cur_buf_size = 0;
struct ggml_tensor * first = ggml_get_first_tensor(ctx);
for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
size_t this_size = 0;
if (t->data == NULL && t->view_src == NULL) {
this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
}
if (this_size > max_size) {
// tensor is too large to fit in a single buffer
fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
__func__, t->name,
ggml_backend_buft_name(buft),
this_size, max_size);
for (size_t i = 0; i < n_buffers; i++) {
ggml_backend_buffer_free(buffers[i]);
}
free(buffers);
return NULL;
}
if ((cur_buf_size + this_size) > max_size) {
// allocate tensors in the current buffer
if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
}
first = t;
cur_buf_size = this_size;
} else {
cur_buf_size += this_size;
}
}
// allocate remaining tensors
if (cur_buf_size > 0) {
if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
}
}
if (n_buffers == 0) {
// all the tensors in the context are already allocated
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}
ggml_backend_buffer_t buffer;
if (n_buffers == 1) {
buffer = buffers[0];
} else {
buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
}
free(buffers);
return buffer;
}


@ -19,6 +19,7 @@ extern "C" {
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
// check if tensor data is in host memory
@ -63,6 +64,11 @@ extern "C" {
// do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
// buffer that contains a collection of buffers
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
//
// Backend
//


@ -27,6 +27,14 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
return buft->iface.get_alignment(buft);
}
size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
// get_max_size is optional, defaults to SIZE_MAX
if (buft->iface.get_max_size) {
return buft->iface.get_max_size(buft);
}
return SIZE_MAX;
}
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
// get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) {
@ -57,8 +65,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
size_t size) {
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
GGML_ASSERT(iface.get_base != NULL);
(*buffer) = (struct ggml_backend_buffer) {
/* .interface = */ iface,
/* .buft = */ buft,
@ -108,6 +114,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
}
size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
}
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
}
@ -122,6 +132,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
buffer->usage = usage;
// FIXME: add a generic callback to the buffer interface
if (ggml_backend_buffer_is_multi_buffer(buffer)) {
ggml_backend_multi_buffer_set_usage(buffer, usage);
}
}
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
@ -171,6 +186,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
}
size_t ggml_backend_get_max_size(ggml_backend_t backend) {
return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
}
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
@ -339,11 +358,21 @@ GGML_CALL static void ggml_backend_registry_init(void) {
ggml_backend_cuda_reg_devices();
#endif
#ifdef GGML_USE_SYCL
extern void ggml_backend_sycl_reg_devices(void);
ggml_backend_sycl_reg_devices();
#endif
#ifdef GGML_USE_METAL
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
#endif
#ifdef GGML_USE_VULKAN
extern GGML_CALL int ggml_backend_vk_reg_devices(void);
ggml_backend_vk_reg_devices();
#endif
}
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
@ -547,6 +576,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@ -602,6 +632,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@ -758,6 +789,80 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
GGML_UNUSED(user_data);
}
// multi-buffer buffer
struct ggml_backend_multi_buffer_context {
ggml_backend_buffer_t * buffers;
size_t n_buffers;
};
typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
}
GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) {
ggml_backend_buffer_free(ctx->buffers[i]);
}
free(ctx->buffers);
free(ctx);
}
GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) {
ggml_backend_buffer_clear(ctx->buffers[i], value);
}
}
static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
static struct ggml_backend_buffer_i multi_backend_buffer_i = {
/* .get_name = */ ggml_backend_multi_buffer_get_name,
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
/* .get_base = */ NULL,
/* .init_tensor = */ NULL,
/* .set_tensor = */ NULL,
/* .get_tensor = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_multi_buffer_clear,
/* .reset = */ NULL,
};
return multi_backend_buffer_i;
}
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
ctx->n_buffers = n_buffers;
ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
size_t total_size = 0;
for (size_t i = 0; i < n_buffers; i++) {
ctx->buffers[i] = buffers[i];
total_size += ggml_backend_buffer_get_size(buffers[i]);
}
return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
}
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
}
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) {
ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
}
}
// scheduler


@ -20,6 +20,7 @@ extern "C" {
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
@ -36,6 +37,7 @@ extern "C" {
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
@ -54,6 +56,7 @@ extern "C" {
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);


@ -10440,6 +10440,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
/* .is_host = */ NULL,
@ -10715,6 +10716,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
/* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
@ -10794,6 +10796,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,


@ -24,10 +24,7 @@
#define UNUSED(x) (void)(x)
#define GGML_METAL_MAX_KERNELS 256
struct ggml_metal_kernel {
id<MTLFunction> function;
id<MTLComputePipelineState> pipeline;
};
@ -162,11 +159,10 @@ struct ggml_metal_context {
id<MTLDevice> device;
id<MTLCommandQueue> queue;
id<MTLLibrary> library;
dispatch_queue_t d_queue;
struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
bool support_simdgroup_reduction;
bool support_simdgroup_mm;
@ -249,6 +245,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->queue = [ctx->device newCommandQueue];
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
id<MTLLibrary> metal_library;
// load library
{
NSBundle * bundle = nil;
@ -263,7 +261,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
// pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:libPath];
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
@ -305,7 +303,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
//[options setFastMathEnabled:false];
metal_library = [ctx->device newLibraryWithSource:src options:options error:&error];
if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
@ -370,8 +368,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
{
NSError * error = nil;
for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
ctx->kernels[i].function = nil;
ctx->kernels[i].pipeline = nil;
}
@@ -383,13 +380,15 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #define GGML_METAL_ADD_KERNEL(e, name, supported) \
         if (supported) { \
             struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
-            kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \
-            kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \
+            id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
+            kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \
+            [metal_function release]; \
             GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
                     (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
                     (int) kernel->pipeline.threadExecutionWidth); \
             if (error) { \
                 GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+                [metal_library release]; \
                 return NULL; \
             } \
         } else { \
@@ -521,23 +520,17 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
     }

+    [metal_library release];
+
     return ctx;
 }

 static void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);

-    for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
-        if (ctx->kernels[i].pipeline) {
-            [ctx->kernels[i].pipeline release];
-        }
-
-        if (ctx->kernels[i].function) {
-            [ctx->kernels[i].function release];
-        }
+    for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
+        [ctx->kernels[i].pipeline release];
     }

-    [ctx->library release];
     [ctx->queue release];
     [ctx->device release];
@@ -2504,6 +2497,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     /* .get_name         = */ ggml_backend_metal_buffer_type_get_name,
     /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
+    /* .get_max_size     = */ NULL, // TODO: return device.maxBufferLength
     /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
     /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
     /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,

ggml-opencl.cpp

@@ -2136,6 +2136,7 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_opencl_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_opencl_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_opencl_buffer_type_get_alignment,
+    /* .get_max_size     = */ NULL, // TODO: return from device info
     /* .get_alloc_size   = */ NULL,
     /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
@@ -2192,6 +2193,7 @@ ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
     /* .get_name         = */ ggml_backend_opencl_host_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
     /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
     /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,

ggml-sycl.cpp (new file, 15199 lines; diff suppressed because it is too large)

ggml-sycl.h (new file, 27 lines)

@@ -0,0 +1,27 @@
/*MIT license
Copyright (C) 2024 Intel Corporation
SPDX-License-Identifier: MIT
*/
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_SYCL_MAX_DEVICES 16
#define GGML_SYCL_NAME "SYCL"
GGML_API void ggml_init_sycl(void);
GGML_API bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
#ifdef __cplusplus
}
#endif

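For orientation, a minimal usage sketch of the API declared above (not part of this commit; it assumes the generic ggml_backend_name and ggml_backend_free helpers from ggml-backend.h):

// sketch: enumerate SYCL devices and bring up one backend (illustrative only)
#include <cstdio>
#include "ggml-backend.h"
#include "ggml-sycl.h"

int main() {
    ggml_backend_sycl_print_sycl_devices();              // list the devices SYCL can see
    ggml_backend_t backend = ggml_backend_sycl_init(0);  // device index 0 (< GGML_SYCL_MAX_DEVICES)
    if (backend == nullptr) {
        fprintf(stderr, "failed to initialize the SYCL backend\n");
        return 1;
    }
    printf("backend: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}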
ggml-vulkan-shaders.hpp (new file, 61420 lines; diff suppressed because it is too large)

ggml-vulkan.cpp (new file, 5176 lines; diff suppressed because it is too large)

ggml-vulkan.h (new file, 34 lines)

@@ -0,0 +1,34 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_VK_NAME "Vulkan"
GGML_API void ggml_vk_init(void);
GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node);
GGML_API void ggml_vk_preallocate_buffers(void);
GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node);
GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#ifdef GGML_VULKAN_CHECK_RESULTS
void ggml_vk_check_results_1(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#endif
GGML_API void ggml_vk_graph_cleanup(void);
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(void);
GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(void);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
#ifdef __cplusplus
}
#endif

ggml.c (65 changed lines)

@@ -248,6 +248,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
+#include "ggml-vulkan.h"
+#elif defined(GGML_USE_SYCL)
+#include "ggml-sycl.h"
 #endif

 // floating point type used to accumulate sums
@@ -2355,6 +2359,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         ggml_init_cublas();
 #elif defined(GGML_USE_CLBLAST)
         ggml_cl_init();
+#elif defined(GGML_USE_VULKAN)
+        ggml_vk_init();
+#elif defined(GGML_USE_SYCL)
+        ggml_init_sycl();
 #endif

         ggml_setup_op_has_task_pass();
@@ -8130,7 +8138,7 @@ static void ggml_compute_forward_mul_f32(
     const int ith = params->ith;
     const int nth = params->nth;

-#ifdef GGML_USE_CLBLAST
+#if defined(GGML_USE_CLBLAST)
     if (src1->backend == GGML_BACKEND_GPU) {
         // TODO: OpenCL kernel support full broadcast
         GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
@@ -10085,7 +10093,7 @@ static void ggml_compute_forward_mul_mat(
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(dst)) {
         const int64_t ne_plane      = ne01*ne00;
-        const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+        const size_t  desired_wsize = ne13*ne12*ne_plane*sizeof(float);
         UNUSED(desired_wsize);

         if (params->type == GGML_TASK_INIT) {
@@ -15010,8 +15018,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     }
     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
+#elif defined(GGML_USE_VULKAN)
+    const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
+#ifdef GGML_VULKAN_CHECK_RESULTS
+    if (skip_cpu) {
+        ggml_vk_check_results_1(params, tensor);
+    }
+#endif
+    if (skip_cpu) {
+        return;
+    }
+    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_CUBLAS
+
+#ifdef GGML_USE_SYCL
+    bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
+    if (skip_cpu) {
+        return;
+    }
+#endif // GGML_USE_SYCL

     switch (tensor->op) {
         case GGML_OP_DUP:
             {
@@ -17418,6 +17444,17 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }

+#ifdef GGML_USE_VULKAN
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
+    }
+    ggml_vk_preallocate_buffers();
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
+    }
+#endif
+
     const int n_threads = cplan->n_threads;

     struct ggml_compute_state_shared state_shared = {
@@ -17469,6 +17506,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }

+#ifdef GGML_USE_VULKAN
+    ggml_vk_graph_cleanup();
+#endif
+
     // performance stats (graph)
     {
         int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_start_cycles;
@@ -20603,7 +20644,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }

 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;
@@ -20626,8 +20667,24 @@ int ggml_cpu_has_clblast(void) {
 #endif
 }

+int ggml_cpu_has_vulkan(void) {
+#if defined(GGML_USE_VULKAN)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_sycl(void) {
+#if defined(GGML_USE_SYCL)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
+    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_sycl();
 }

 int ggml_cpu_has_sse3(void) {

ggml.h (2 changed lines)

@@ -2281,9 +2281,11 @@ extern "C" {
     GGML_API int ggml_cpu_has_blas       (void);
     GGML_API int ggml_cpu_has_cublas     (void);
     GGML_API int ggml_cpu_has_clblast    (void);
+    GGML_API int ggml_cpu_has_vulkan     (void);
     GGML_API int ggml_cpu_has_gpublas    (void);
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_ssse3      (void);
+    GGML_API int ggml_cpu_has_sycl       (void);
     GGML_API int ggml_cpu_has_vsx        (void);

     //

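For orientation, a small sketch (not in the diff) of how the two new capability probes can be used next to the existing ones, for example in a build-info dump:

// sketch: report which GPU paths this build was compiled with
#include <cstdio>
#include "ggml.h"

static void print_gpu_support(void) {
    printf("cublas : %d\n", ggml_cpu_has_cublas());
    printf("clblast: %d\n", ggml_cpu_has_clblast());
    printf("vulkan : %d\n", ggml_cpu_has_vulkan());
    printf("sycl   : %d\n", ggml_cpu_has_sycl());
    // per the ggml.c change above, gpublas is now the OR of all four
    printf("gpublas: %d\n", ggml_cpu_has_gpublas());
}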
ggml_vk_generate_shaders.py (new file, 2362 lines; diff suppressed because it is too large)

llama.cpp

@@ -11,6 +11,10 @@
 # include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 # include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
+# include "ggml-vulkan.h"
+#elif defined(GGML_USE_SYCL)
+# include "ggml-sycl.h"
 #endif

 #ifdef GGML_USE_METAL
@@ -52,6 +56,7 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cfloat>
 #include <cinttypes>
 #include <climits>
 #include <cmath>
@@ -1279,8 +1284,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
     if (host_buffer) {
         buft = ggml_backend_cuda_host_buffer_type();
     }
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_host_buffer_type();
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
+#elif defined(GGML_USE_VULKAN)
+    if (host_buffer) {
+        buft = ggml_backend_vk_host_buffer_type();
+    }
 #endif

     if (buft == nullptr) {
@@ -1298,6 +1309,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUBLAS)
     buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+    buft = ggml_backend_vk_buffer_type();
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(gpu);
 #elif defined(GGML_USE_CLBLAST)
     buft = ggml_backend_opencl_buffer_type();
 #endif
@@ -6875,7 +6890,7 @@ static int llama_decode_internal(
     }

     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
-    if (ggml_cpu_has_cublas() && fully_offloaded) {
+    if ((ggml_cpu_has_cublas() || ggml_cpu_has_vulkan()) && fully_offloaded) {
         n_threads = 1;
     }
@@ -8282,21 +8297,56 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
         return;
     }

-    llama_sample_softmax(ctx, candidates);
-
     const int64_t t_start_sample_us = ggml_time_us();

-    float scale = candidates->data[0].p; // scale by max prob
-    size_t i = 1; // first token always matches
+    bool min_p_applied = false;

-    for (; i < candidates->size; ++i) {
-        if (candidates->data[i].p < p * scale && i >= min_keep) {
-            break; // prob too small
+    // if the candidates aren't sorted, try the unsorted implementation first
+    if (!candidates->sorted) {
+        std::vector<llama_token_data> filtered_tokens;
+
+        float max_logit = -FLT_MAX;
+        for (size_t i = 0; i < candidates->size; ++i) {
+            max_logit = std::max(max_logit, candidates->data[i].logit);
+        }
+        const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
+
+        for (size_t i = 0; i < candidates->size; ++i) {
+            if (candidates->data[i].logit >= min_logit) {
+                filtered_tokens.push_back(candidates->data[i]);
+            }
+        }
+
+        // if we have enough values the operation was a success
+        if (filtered_tokens.size() >= min_keep) {
+            memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
+            candidates->size = filtered_tokens.size();
+            min_p_applied = true;
         }
     }

-    // Resize the output vector to keep only the matching tokens
-    candidates->size = i;
+    // if the candidates are sorted or the unsorted implementation failed, use this implementation
+    if (!min_p_applied) {
+        // Sort the logits in descending order
+        if (!candidates->sorted) {
+            std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+                return a.logit > b.logit;
+            });
+            candidates->sorted = true;
+        }
+
+        const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
+        size_t i = 1; // first token always matches
+
+        for (; i < candidates->size; ++i) {
+            if (candidates->data[i].logit < min_logit && i >= min_keep) {
+                break; // prob too small
+            }
+        }
+
+        // Resize the output vector to keep only the matching tokens
+        candidates->size = i;
+    }

     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
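Both branches above filter in logit space instead of calling llama_sample_softmax first; they rely on the identity p_i / p_max = exp(logit_i - logit_max), so p_i >= p * p_max holds exactly when logit_i >= logit_max + log(p). A small self-contained check with made-up numbers (a sketch, not from the patch):

// sketch: the probability-space and logit-space min-p tests agree
#include <cassert>
#include <cmath>

int main() {
    const float p         = 0.05f;               // min-p threshold
    const float logit_max = 7.2f;                // example values
    const float logit_i   = 4.5f;
    const float min_logit = logit_max + logf(p); // roughly logit_max - 3.0
    const bool keep_prob  = expf(logit_i - logit_max) >= p; // p_i / p_max >= p
    const bool keep_logit = logit_i >= min_logit;
    assert(keep_prob == keep_logit);             // both true here: 0.067 >= 0.05 and 4.5 >= 4.2
    return 0;
}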
@@ -10225,6 +10275,26 @@ struct llama_context * llama_new_context_with_model(
                 }
             }
         }
+#elif defined(GGML_USE_VULKAN)
+        if (model->n_gpu_layers > 0) {
+            ggml_backend_t backend = ggml_backend_vk_init();
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
+#elif defined(GGML_USE_SYCL)
+        if (model->n_gpu_layers > 0) {
+            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
 #endif

         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {

llama.h

@@ -6,6 +6,9 @@
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#elif defined(GGML_USE_SYCL)
+#include "ggml-sycl.h"
+#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
 #else
 #define LLAMA_MAX_DEVICES 1
 #endif // GGML_USE_CUBLAS
@@ -46,7 +49,7 @@
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 4

-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif

View File

@@ -1 +1 @@
-c2448f88d17395452a587d0176d19ed87e0f7ce1
+f2a9472b23cf27e672ed70a2a6eb078f7b060f18

tests/test-backend-ops.cpp

@@ -239,10 +239,17 @@ static std::string var_to_str(ggml_type type) {
 #define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
 #define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)

+#ifdef GGML_USE_SYCL
+static bool inline _isinf(float f) {
+    return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000;
+}
+#else
+static bool inline _isinf(float f) { return std::isinf(f); }
+#endif
+
 // accept FLT_MAX as infinity
 static bool isinf_or_max(float f) {
-    return std::isinf(f) || f == FLT_MAX || f == -FLT_MAX;
+    return _isinf(f) || f == FLT_MAX || f == -FLT_MAX;
 }

 static bool ggml_is_view_op(enum ggml_op op) {
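The SYCL-specific _isinf above tests the raw bit pattern: in IEEE-754 binary32 an infinity has an all-ones exponent and a zero mantissa (0x7f800000), and masking with 0x7fffffff clears the sign bit so +inf and -inf both match. A standalone sketch of the same check (not from the diff; it uses memcpy rather than the pointer cast to avoid strict-aliasing issues):

// sketch: bit-pattern test for +/-infinity in a 32-bit float
#include <cassert>
#include <cstdint>
#include <cstring>
#include <limits>

static bool isinf_bits(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));        // reinterpret the bits without aliasing issues
    return (bits & 0x7fffffffu) == 0x7f800000u;  // sign cleared, exponent all ones, mantissa zero
}

int main() {
    assert( isinf_bits( std::numeric_limits<float>::infinity()));
    assert( isinf_bits(-std::numeric_limits<float>::infinity()));
    assert(!isinf_bits(3.0f));
    assert(!isinf_bits(std::numeric_limits<float>::quiet_NaN())); // NaN: mantissa is nonzero
    return 0;
}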