ggml : build backends as libraries (#10256)

* ggml : build backends as libraries

---------

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: R0CKSTAR <xiaodong.ye@mthreads.com>
This commit is contained in:
Diego Devesa 2024-11-14 18:04:35 +01:00 committed by GitHub
parent 4a8ccb37ad
commit ae8de6d50a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
191 changed files with 17541 additions and 16879 deletions

View File

@ -23,15 +23,16 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \ fi && \
cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-cli -j$(nproc) cmake --build build --config Release --target llama-cli -j$(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y libgomp1 apt-get install -y libgomp1
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so COPY --from=build /app/lib/ /
COPY --from=build /app/build/src/libllama.so /libllama.so COPY --from=build /app/build/bin/llama-cli /
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENTRYPOINT [ "/llama-cli" ] ENTRYPOINT [ "/llama-cli" ]

View File

@ -16,15 +16,16 @@ WORKDIR /app
COPY . . COPY . .
RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-cli -j$(nproc) cmake --build build --config Release --target llama-cli -j$(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y libgomp1 apt-get install -y libgomp1
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so COPY --from=build /app/lib/ /
COPY --from=build /app/build/src/libllama.so /libllama.so
COPY --from=build /app/build/bin/llama-cli /llama-cli COPY --from=build /app/build/bin/llama-cli /llama-cli
ENTRYPOINT [ "/llama-cli" ] ENTRYPOINT [ "/llama-cli" ]

View File

@ -23,15 +23,16 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \ fi && \
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-server -j$(nproc) cmake --build build --config Release --target llama-server -j$(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so COPY --from=build /app/lib/ /
COPY --from=build /app/build/src/libllama.so /libllama.so
COPY --from=build /app/build/bin/llama-server /llama-server COPY --from=build /app/build/bin/llama-server /llama-server
# Must be set to 0.0.0.0 so it can listen to requests from host machine # Must be set to 0.0.0.0 so it can listen to requests from host machine

View File

@ -16,15 +16,16 @@ WORKDIR /app
COPY . . COPY . .
RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-server -j$(nproc) cmake --build build --config Release --target llama-server -j$(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so COPY --from=build /app/lib/ /
COPY --from=build /app/build/src/libllama.so /libllama.so
COPY --from=build /app/build/bin/llama-server /llama-server COPY --from=build /app/build/bin/llama-server /llama-server
# Must be set to 0.0.0.0 so it can listen to requests from host machine # Must be set to 0.0.0.0 so it can listen to requests from host machine

View File

@ -126,9 +126,9 @@ effectiveStdenv.mkDerivation (finalAttrs: {
}; };
postPatch = '' postPatch = ''
substituteInPlace ./ggml/src/ggml-metal.m \ substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
substituteInPlace ./ggml/src/ggml-metal.m \ substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";" --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
''; '';
@ -173,7 +173,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
(cmakeBool "GGML_NATIVE" false) (cmakeBool "GGML_NATIVE" false)
(cmakeBool "GGML_BLAS" useBlas) (cmakeBool "GGML_BLAS" useBlas)
(cmakeBool "GGML_CUDA" useCuda) (cmakeBool "GGML_CUDA" useCuda)
(cmakeBool "GGML_HIPBLAS" useRocm) (cmakeBool "GGML_HIP" useRocm)
(cmakeBool "GGML_METAL" useMetalKit) (cmakeBool "GGML_METAL" useMetalKit)
(cmakeBool "GGML_VULKAN" useVulkan) (cmakeBool "GGML_VULKAN" useVulkan)
(cmakeBool "GGML_STATIC" enableStatic) (cmakeBool "GGML_STATIC" enableStatic)

View File

@ -405,13 +405,13 @@ jobs:
- name: Build with native CMake HIP support - name: Build with native CMake HIP support
id: cmake_build id: cmake_build
run: | run: |
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON
cmake --build build --config Release -j $(nproc) cmake --build build --config Release -j $(nproc)
- name: Build with legacy HIP support - name: Build with legacy HIP support
id: cmake_build_legacy_hip id: cmake_build_legacy_hip
run: | run: |
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
cmake --build build2 --config Release -j $(nproc) cmake --build build2 --config Release -j $(nproc)
ubuntu-22-cmake-sycl: ubuntu-22-cmake-sycl:
@ -747,7 +747,7 @@ jobs:
id: clone_kompute id: clone_kompute
if: ${{ matrix.build == 'kompute-x64' }} if: ${{ matrix.build == 'kompute-x64' }}
run: | run: |
git submodule update --init ggml/src/kompute git submodule update --init ggml/src/ggml-kompute/kompute
- name: Download OpenBLAS - name: Download OpenBLAS
id: get_openblas id: get_openblas
@ -1014,7 +1014,7 @@ jobs:
run: | run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS} cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
windows-latest-cmake-hip-release: windows-latest-cmake-hip-release:
@ -1050,7 +1050,7 @@ jobs:
run: | run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS} cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\" md "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"

2
.gitmodules vendored
View File

@ -1,3 +1,3 @@
[submodule "kompute"] [submodule "kompute"]
path = ggml/src/kompute path = ggml/src/ggml-kompute/kompute
url = https://github.com/nomic-ai/kompute.git url = https://github.com/nomic-ai/kompute.git

View File

@ -140,7 +140,6 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
# At the moment some compile definitions are placed within the ggml/src # At the moment some compile definitions are placed within the ggml/src
# directory but not exported on the `ggml` target. This could be improved by # directory but not exported on the `ggml` target. This could be improved by
# determining _precisely_ which defines are necessary for the llama-config # determining _precisely_ which defines are necessary for the llama-config

267
Makefile
View File

@ -523,65 +523,54 @@ ifndef GGML_NO_ACCELERATE
# Mac OS - include Accelerate framework. # Mac OS - include Accelerate framework.
# `-framework Accelerate` works both with Apple Silicon and Mac Intel # `-framework Accelerate` works both with Apple Silicon and Mac Intel
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
MK_LDFLAGS += -framework Accelerate MK_LDFLAGS += -framework Accelerate
OBJ_GGML += ggml/src/ggml-blas.o OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o
endif endif
endif # GGML_NO_ACCELERATE endif # GGML_NO_ACCELERATE
ifdef GGML_MUSA
CC := clang
CXX := clang++
GGML_CUDA := 1
MK_CPPFLAGS += -DGGML_USE_MUSA
endif
ifndef GGML_NO_OPENMP ifndef GGML_NO_OPENMP
MK_CPPFLAGS += -DGGML_USE_OPENMP MK_CPPFLAGS += -DGGML_USE_OPENMP
MK_CFLAGS += -fopenmp MK_CFLAGS += -fopenmp
MK_CXXFLAGS += -fopenmp MK_CXXFLAGS += -fopenmp
ifdef GGML_MUSA
MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
MK_LDFLAGS += -L/usr/lib/llvm-10/lib
endif # GGML_MUSA
endif # GGML_NO_OPENMP endif # GGML_NO_OPENMP
ifdef GGML_OPENBLAS ifdef GGML_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
MK_LDFLAGS += $(shell pkg-config --libs openblas) MK_LDFLAGS += $(shell pkg-config --libs openblas)
OBJ_GGML += ggml/src/ggml-blas.o OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o
endif # GGML_OPENBLAS endif # GGML_OPENBLAS
ifdef GGML_OPENBLAS64 ifdef GGML_OPENBLAS64
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64)
MK_LDFLAGS += $(shell pkg-config --libs openblas64) MK_LDFLAGS += $(shell pkg-config --libs openblas64)
OBJ_GGML += ggml/src/ggml-blas.o OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o
endif # GGML_OPENBLAS64 endif # GGML_OPENBLAS64
ifdef GGML_BLIS ifdef GGML_BLIS
MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
MK_LDFLAGS += -lblis -L/usr/local/lib MK_LDFLAGS += -lblis -L/usr/local/lib
OBJ_GGML += ggml/src/ggml-blas.o OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o
endif # GGML_BLIS endif # GGML_BLIS
ifdef GGML_NVPL ifdef GGML_NVPL
MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
OBJ_GGML += ggml/src/ggml-blas.o OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o
endif # GGML_NVPL endif # GGML_NVPL
ifndef GGML_NO_LLAMAFILE ifndef GGML_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJ_GGML += ggml/src/llamafile/sgemm.o OBJ_GGML += ggml/src/ggml-cpu/llamafile/sgemm.o
endif endif
ifndef GGML_NO_AMX ifndef GGML_NO_AMX
MK_CPPFLAGS += -DGGML_USE_AMX MK_CPPFLAGS += -DGGML_USE_AMX
OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o OBJ_GGML += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o
endif endif
ifdef GGML_RPC ifdef GGML_RPC
@ -601,29 +590,17 @@ else
endif # GGML_CUDA_FA_ALL_QUANTS endif # GGML_CUDA_FA_ALL_QUANTS
ifdef GGML_CUDA ifdef GGML_CUDA
ifdef GGML_MUSA ifneq ('', '$(wildcard /opt/cuda)')
ifneq ('', '$(wildcard /opt/musa)') CUDA_PATH ?= /opt/cuda
CUDA_PATH ?= /opt/musa
else
CUDA_PATH ?= /usr/local/musa
endif
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22
else else
ifneq ('', '$(wildcard /opt/cuda)') CUDA_PATH ?= /usr/local/cuda
CUDA_PATH ?= /opt/cuda endif
else
CUDA_PATH ?= /usr/local/cuda
endif
MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
MK_NVCCFLAGS += -use_fast_math MK_NVCCFLAGS += -use_fast_math
endif # GGML_MUSA
OBJ_GGML += ggml/src/ggml-cuda.o OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
OBJ_GGML += $(OBJ_CUDA_TMPL) OBJ_GGML += $(OBJ_CUDA_TMPL)
@ -631,11 +608,9 @@ ifdef LLAMA_FATAL_WARNINGS
MK_NVCCFLAGS += -Werror all-warnings MK_NVCCFLAGS += -Werror all-warnings
endif # LLAMA_FATAL_WARNINGS endif # LLAMA_FATAL_WARNINGS
ifndef GGML_MUSA
ifndef JETSON_EOL_MODULE_DETECT ifndef JETSON_EOL_MODULE_DETECT
MK_NVCCFLAGS += --forward-unknown-to-host-compiler MK_NVCCFLAGS += --forward-unknown-to-host-compiler
endif # JETSON_EOL_MODULE_DETECT endif # JETSON_EOL_MODULE_DETECT
endif # GGML_MUSA
ifdef LLAMA_DEBUG ifdef LLAMA_DEBUG
MK_NVCCFLAGS += -lineinfo MK_NVCCFLAGS += -lineinfo
@ -648,11 +623,7 @@ endif # GGML_CUDA_DEBUG
ifdef GGML_CUDA_NVCC ifdef GGML_CUDA_NVCC
NVCC = $(CCACHE) $(GGML_CUDA_NVCC) NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
else else
ifdef GGML_MUSA NVCC = $(CCACHE) nvcc
NVCC = $(CCACHE) mcc
else
NVCC = $(CCACHE) nvcc
endif # GGML_MUSA
endif # GGML_CUDA_NVCC endif # GGML_CUDA_NVCC
ifdef CUDA_DOCKER_ARCH ifdef CUDA_DOCKER_ARCH
@ -724,15 +695,9 @@ define NVCC_COMPILE
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endef # NVCC_COMPILE endef # NVCC_COMPILE
else else
ifdef GGML_MUSA
define NVCC_COMPILE
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@
endef # NVCC_COMPILE
else
define NVCC_COMPILE define NVCC_COMPILE
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endef # NVCC_COMPILE endef # NVCC_COMPILE
endif # GGML_MUSA
endif # JETSON_EOL_MODULE_DETECT endif # JETSON_EOL_MODULE_DETECT
ggml/src/ggml-cuda/%.o: \ ggml/src/ggml-cuda/%.o: \
@ -742,8 +707,8 @@ ggml/src/ggml-cuda/%.o: \
ggml/src/ggml-cuda/common.cuh ggml/src/ggml-cuda/common.cuh
$(NVCC_COMPILE) $(NVCC_COMPILE)
ggml/src/ggml-cuda.o: \ ggml/src/ggml-cuda/ggml-cuda.o: \
ggml/src/ggml-cuda.cu \ ggml/src/ggml-cuda/ggml-cuda.cu \
ggml/include/ggml-cuda.h \ ggml/include/ggml-cuda.h \
ggml/include/ggml.h \ ggml/include/ggml.h \
ggml/include/ggml-backend.h \ ggml/include/ggml-backend.h \
@ -819,7 +784,7 @@ ifdef GGML_HIPBLAS
GGML_CUDA_MMV_Y ?= 1 GGML_CUDA_MMV_Y ?= 1
GGML_CUDA_KQUANTS_ITER ?= 2 GGML_CUDA_KQUANTS_ITER ?= 2
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
ifdef GGML_HIP_UMA ifdef GGML_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA MK_CPPFLAGS += -DGGML_HIP_UMA
@ -852,12 +817,12 @@ ifdef GGML_CUDA_NO_PEER_COPY
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # GGML_CUDA_NO_PEER_COPY endif # GGML_CUDA_NO_PEER_COPY
OBJ_GGML += ggml/src/ggml-cuda.o OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
OBJ_GGML += $(OBJ_CUDA_TMPL) OBJ_GGML += $(OBJ_CUDA_TMPL)
ggml/src/ggml-cuda.o: \ ggml/src/ggml-cuda/ggml-cuda.o: \
ggml/src/ggml-cuda.cu \ ggml/src/ggml-cuda/ggml-cuda.cu \
ggml/include/ggml-cuda.h \ ggml/include/ggml-cuda.h \
ggml/include/ggml.h \ ggml/include/ggml.h \
ggml/include/ggml-backend.h \ ggml/include/ggml-backend.h \
@ -874,10 +839,111 @@ ggml/src/ggml-cuda/%.o: \
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif # GGML_HIPBLAS endif # GGML_HIPBLAS
ifdef GGML_MUSA
ifeq ($(wildcard /opt/musa),)
MUSA_PATH ?= /usr/local/musa
else
MUSA_PATH ?= /opt/musa
endif
MTGPU_TARGETS ?= mp_21 mp_22
MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
MK_LDFLAGS += -lmusa -lmusart -lmublas
ifndef GGML_NO_OPENMP
# For Ubuntu Focal
MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
MK_LDFLAGS += -L/usr/lib/llvm-10/lib
# For Ubuntu Jammy
MK_CPPFLAGS += -I/usr/lib/llvm-14/lib/clang/14.0.0/include
MK_LDFLAGS += -L/usr/lib/llvm-14/lib
endif # GGML_NO_OPENMP
CC := $(MUSA_PATH)/bin/clang
CXX := $(MUSA_PATH)/bin/clang++
MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS))
ifdef GGML_CUDA_FORCE_DMMV
MUSAFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # GGML_CUDA_FORCE_DMMV
ifdef GGML_CUDA_FORCE_MMQ
MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
endif # GGML_CUDA_FORCE_MMQ
ifdef GGML_CUDA_FORCE_CUBLAS
MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS
endif # GGML_CUDA_FORCE_CUBLAS
ifdef GGML_CUDA_DMMV_X
MUSAFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
else
MUSAFLAGS += -DGGML_CUDA_DMMV_X=32
endif # GGML_CUDA_DMMV_X
ifdef GGML_CUDA_MMV_Y
MUSAFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
else
MUSAFLAGS += -DGGML_CUDA_MMV_Y=1
endif # GGML_CUDA_MMV_Y
ifdef GGML_CUDA_F16
MUSAFLAGS += -DGGML_CUDA_F16
endif # GGML_CUDA_F16
ifdef GGML_CUDA_DMMV_F16
MUSAFLAGS += -DGGML_CUDA_F16
endif # GGML_CUDA_DMMV_F16
ifdef GGML_CUDA_KQUANTS_ITER
MUSAFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
else
MUSAFLAGS += -DK_QUANTS_PER_ITERATION=2
endif
ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
else
MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
endif # GGML_CUDA_PEER_MAX_BATCH_SIZE
ifdef GGML_CUDA_NO_PEER_COPY
MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # GGML_CUDA_NO_PEER_COPY
ifdef GGML_CUDA_FA_ALL_QUANTS
MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
endif # GGML_CUDA_FA_ALL_QUANTS
OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
OBJ_GGML += $(OBJ_CUDA_TMPL)
ggml/src/ggml-cuda/ggml-cuda.o: \
ggml/src/ggml-cuda/ggml-cuda.cu \
ggml/include/ggml-cuda.h \
ggml/include/ggml.h \
ggml/include/ggml-backend.h \
ggml/src/ggml-backend-impl.h \
ggml/src/ggml-common.h \
$(wildcard ggml/src/ggml-cuda/*.cuh)
$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
ggml/src/ggml-cuda/%.o: \
ggml/src/ggml-cuda/%.cu \
ggml/include/ggml.h \
ggml/src/ggml-common.h \
ggml/src/ggml-cuda/common.cuh
$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
endif # GGML_MUSA
ifdef GGML_METAL ifdef GGML_METAL
MK_CPPFLAGS += -DGGML_USE_METAL MK_CPPFLAGS += -DGGML_USE_METAL
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
OBJ_GGML += ggml/src/ggml-metal.o OBJ_GGML += ggml/src/ggml-metal/ggml-metal.o
ifdef GGML_METAL_USE_BF16 ifdef GGML_METAL_USE_BF16
MK_CPPFLAGS += -DGGML_METAL_USE_BF16 MK_CPPFLAGS += -DGGML_METAL_USE_BF16
@ -887,30 +953,30 @@ ifdef GGML_METAL_NDEBUG
endif endif
ifdef GGML_METAL_EMBED_LIBRARY ifdef GGML_METAL_EMBED_LIBRARY
MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
OBJ_GGML += ggml/src/ggml-metal-embed.o OBJ_GGML += ggml/src/ggml-metal-embed.o
endif endif
endif # GGML_METAL endif # GGML_METAL
ifdef GGML_METAL ifdef GGML_METAL
ggml/src/ggml-metal.o: \ ggml/src/ggml-metal/ggml-metal.o: \
ggml/src/ggml-metal.m \ ggml/src/ggml-metal/ggml-metal.m \
ggml/include/ggml-metal.h \ ggml/include/ggml-metal.h \
ggml/include/ggml.h ggml/include/ggml.h
$(CC) $(CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
ifdef GGML_METAL_EMBED_LIBRARY ifdef GGML_METAL_EMBED_LIBRARY
ggml/src/ggml-metal-embed.o: \ ggml/src/ggml-metal-embed.o: \
ggml/src/ggml-metal.metal \ ggml/src/ggml-metal/ggml-metal.metal \
ggml/src/ggml-common.h ggml/src/ggml-common.h
@echo "Embedding Metal library" @echo "Embedding Metal library"
@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal @sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal
$(eval TEMP_ASSEMBLY=$(shell mktemp -d)) $(eval TEMP_ASSEMBLY=$(shell mktemp -d))
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s @echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@ $(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s @rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
@rmdir ${TEMP_ASSEMBLY} @rmdir ${TEMP_ASSEMBLY}
@ -919,11 +985,16 @@ endif # GGML_METAL
OBJ_GGML += \ OBJ_GGML += \
ggml/src/ggml.o \ ggml/src/ggml.o \
ggml/src/ggml-cpu.o \ ggml/src/ggml-aarch64.o \
ggml/src/ggml-alloc.o \ ggml/src/ggml-alloc.o \
ggml/src/ggml-backend.o \ ggml/src/ggml-backend.o \
ggml/src/ggml-backend-reg.o \
ggml/src/ggml-quants.o \ ggml/src/ggml-quants.o \
ggml/src/ggml-aarch64.o ggml/src/ggml-threading.o \
ggml/src/ggml-cpu/ggml-cpu.o \
ggml/src/ggml-cpu/ggml-cpu-cpp.o \
ggml/src/ggml-cpu/ggml-cpu-aarch64.o \
ggml/src/ggml-cpu/ggml-cpu-quants.o
OBJ_LLAMA = \ OBJ_LLAMA = \
src/llama.o \ src/llama.o \
@ -997,7 +1068,6 @@ $(info I CXX: $(shell $(CXX) --version | head -n 1))
ifdef GGML_CUDA ifdef GGML_CUDA
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
ifndef GGML_MUSA
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
ifndef CUDA_DOCKER_ARCH ifndef CUDA_DOCKER_ARCH
@ -1007,7 +1077,6 @@ endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
endif # GGML_MUSA
endif # GGML_CUDA endif # GGML_CUDA
$(info ) $(info )
@ -1051,12 +1120,23 @@ ggml/src/ggml.o: \
ggml/include/ggml.h ggml/include/ggml.h
$(CC) $(CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-cpu.o: \ ggml/src/ggml-threading.o: \
ggml/src/ggml-cpu.c \ ggml/src/ggml-threading.cpp \
ggml/include/ggml.h
$(CXX) $(XXCFLAGS) -c $< -o $@
ggml/src/ggml-cpu/ggml-cpu.o: \
ggml/src/ggml-cpu/ggml-cpu.c \
ggml/include/ggml.h \ ggml/include/ggml.h \
ggml/src/ggml-common.h ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-cpu/ggml-cpu-cpp.o: \
ggml/src/ggml-cpu/ggml-cpu.cpp \
ggml/include/ggml.h \
ggml/src/ggml-common.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml/src/ggml-alloc.o: \ ggml/src/ggml-alloc.o: \
ggml/src/ggml-alloc.c \ ggml/src/ggml-alloc.c \
ggml/include/ggml.h \ ggml/include/ggml.h \
@ -1084,22 +1164,22 @@ ggml/src/ggml-aarch64.o: \
ggml/src/ggml-common.h ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-blas.o: \ ggml/src/ggml-blas/ggml-blas.o: \
ggml/src/ggml-blas.cpp \ ggml/src/ggml-blas/ggml-blas.cpp \
ggml/include/ggml-blas.h ggml/include/ggml-blas.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
ifndef GGML_NO_LLAMAFILE ifndef GGML_NO_LLAMAFILE
ggml/src/llamafile/sgemm.o: \ ggml/src/ggml-cpu/llamafile/sgemm.o: \
ggml/src/llamafile/sgemm.cpp \ ggml/src/ggml-cpu/llamafile/sgemm.cpp \
ggml/src/llamafile/sgemm.h \ ggml/src/ggml-cpu/llamafile/sgemm.h \
ggml/include/ggml.h ggml/include/ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@ -I ggml/src -I ggml/src/ggml-cpu
endif # GGML_NO_LLAMAFILE endif # GGML_NO_LLAMAFILE
ifndef GGML_NO_AMX ifndef GGML_NO_AMX
ggml/src/ggml-amx.o: \ ggml/src/ggml-amx/ggml-amx.o: \
ggml/src/ggml-amx.cpp \ ggml/src/ggml-amx/ggml-amx.cpp \
ggml/include/ggml-amx.h ggml/include/ggml-amx.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
@ -1250,13 +1330,24 @@ clean:
rm -rvf ggml/*.a rm -rvf ggml/*.a
rm -rvf ggml/*.dll rm -rvf ggml/*.dll
rm -rvf ggml/*.so rm -rvf ggml/*.so
rm -vrf ggml/src/*.o rm -rvf ggml/src/*.o
rm -rvf ggml/src/llamafile/*.o
rm -rvf common/build-info.cpp rm -rvf common/build-info.cpp
rm -vrf ggml/src/ggml-metal-embed.metal rm -rvf ggml/src/ggml-cpu/*.o
rm -rvf ggml/src/ggml-cpu/llamafile/*.o
rm -vrf ggml/src/ggml-amx/*.o
rm -vrf ggml/src/ggml-blas/*.o
rm -vrf ggml/src/ggml-cann/*.o
rm -vrf ggml/src/ggml-cpu/*.o
rm -vrf ggml/src/ggml-cuda/*.o rm -vrf ggml/src/ggml-cuda/*.o
rm -vrf ggml/src/ggml-cuda/template-instances/*.o rm -vrf ggml/src/ggml-cuda/template-instances/*.o
rm -vrf ggml/src/ggml-amx/*.o rm -vrf ggml/src/ggml-hip/*.o
rm -vrf ggml/src/ggml-kompute/*.o
rm -vrf ggml/src/ggml-metal/*.o
rm -vrf ggml/src/ggml-metal/ggml-metal-embed.metal
rm -vrf ggml/src/ggml-rpc/*.o
rm -vrf ggml/src/ggml-sycl/*.o
rm -vrf ggml/src/ggml-vulkan/*.o
rm -vrf ggml/src/ggml-musa/*.o
rm -rvf $(BUILD_TARGETS) rm -rvf $(BUILD_TARGETS)
rm -rvf $(TEST_TARGETS) rm -rvf $(TEST_TARGETS)
rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp

View File

@ -10,11 +10,16 @@ var sources = [
"src/unicode.cpp", "src/unicode.cpp",
"src/unicode-data.cpp", "src/unicode-data.cpp",
"ggml/src/ggml.c", "ggml/src/ggml.c",
"ggml/src/ggml-cpu.c", "ggml/src/ggml-aarch64.c",
"ggml/src/ggml-alloc.c", "ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.cpp", "ggml/src/ggml-backend.cpp",
"ggml/src/ggml-backend-reg.cpp",
"ggml/src/ggml-cpu/ggml-cpu.c",
"ggml/src/ggml-cpu/ggml-cpu.cpp",
"ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
"ggml/src/ggml-cpu/ggml-cpu-quants.c",
"ggml/src/ggml-threading.cpp",
"ggml/src/ggml-quants.c", "ggml/src/ggml-quants.c",
"ggml/src/ggml-aarch64.c",
] ]
var resources: [Resource] = [] var resources: [Resource] = []
@ -22,6 +27,7 @@ var linkerSettings: [LinkerSetting] = []
var cSettings: [CSetting] = [ var cSettings: [CSetting] = [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.unsafeFlags(["-fno-objc-arc"]), .unsafeFlags(["-fno-objc-arc"]),
.headerSearchPath("ggml/src"),
// NOTE: NEW_LAPACK will required iOS version 16.4+ // NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14 // We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
@ -30,8 +36,9 @@ var cSettings: [CSetting] = [
] ]
#if canImport(Darwin) #if canImport(Darwin)
sources.append("ggml/src/ggml-metal.m") sources.append("ggml/src/ggml-common.h")
resources.append(.process("ggml/src/ggml-metal.metal")) sources.append("ggml/src/ggml-metal/ggml-metal.m")
resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal"))
linkerSettings.append(.linkedFramework("Accelerate")) linkerSettings.append(.linkedFramework("Accelerate"))
cSettings.append( cSettings.append(
contentsOf: [ contentsOf: [

View File

@ -6,7 +6,7 @@ set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(GGML_BLAS @GGML_BLAS@) set(GGML_BLAS @GGML_BLAS@)
set(GGML_CUDA @GGML_CUDA@) set(GGML_CUDA @GGML_CUDA@)
set(GGML_METAL @GGML_METAL@) set(GGML_METAL @GGML_METAL@)
set(GGML_HIPBLAS @GGML_HIPBLAS@) set(GGML_HIP @GGML_HIP@)
set(GGML_ACCELERATE @GGML_ACCELERATE@) set(GGML_ACCELERATE @GGML_ACCELERATE@)
set(GGML_VULKAN @GGML_VULKAN@) set(GGML_VULKAN @GGML_VULKAN@)
set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@) set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)

View File

@ -1967,18 +1967,13 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false"); fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false"); fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false"); fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");

View File

@ -230,7 +230,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash ```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16 && cmake --build build --config Release -- -j 16
``` ```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`. On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
@ -247,7 +247,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
```bash ```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \ HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \ HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -- -j 16 && cmake --build build -- -j 16
``` ```
@ -259,7 +259,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash ```bash
set PATH=%HIP_PATH%\bin;%PATH% set PATH=%HIP_PATH%\bin;%PATH%
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cmake --build build cmake --build build
``` ```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors) Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)

View File

@ -774,13 +774,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
struct test { struct test {
static const std::string build_commit; static const std::string build_commit;
static const int build_number; static const int build_number;
static const bool cuda;
static const bool vulkan;
static const bool kompute;
static const bool metal;
static const bool sycl;
static const bool gpu_blas;
static const bool blas;
static const std::string cpu_info; static const std::string cpu_info;
static const std::string gpu_info; static const std::string gpu_info;
std::string model_filename; std::string model_filename;
@ -793,7 +786,6 @@ struct test {
std::string cpu_mask; std::string cpu_mask;
bool cpu_strict; bool cpu_strict;
int poll; int poll;
bool has_rpc;
ggml_type type_k; ggml_type type_k;
ggml_type type_v; ggml_type type_v;
int n_gpu_layers; int n_gpu_layers;
@ -822,7 +814,6 @@ struct test {
cpu_mask = inst.cpu_mask; cpu_mask = inst.cpu_mask;
cpu_strict = inst.cpu_strict; cpu_strict = inst.cpu_strict;
poll = inst.poll; poll = inst.poll;
has_rpc = !inst.rpc_servers.empty();
type_k = inst.type_k; type_k = inst.type_k;
type_v = inst.type_v; type_v = inst.type_v;
n_gpu_layers = inst.n_gpu_layers; n_gpu_layers = inst.n_gpu_layers;
@ -881,8 +872,7 @@ struct test {
static const std::vector<std::string> & get_fields() { static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = { static const std::vector<std::string> fields = {
"build_commit", "build_number", "build_commit", "build_number",
"cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas", "cpu_info", "gpu_info", "backends",
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params", "model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch", "n_batch", "n_ubatch",
"n_threads", "cpu_mask", "cpu_strict", "poll", "n_threads", "cpu_mask", "cpu_strict", "poll",
@ -908,8 +898,7 @@ struct test {
field == "avg_ns" || field == "stddev_ns") { field == "avg_ns" || field == "stddev_ns") {
return INT; return INT;
} }
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" || if (field == "f16_kv" || field == "no_kv_offload" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "cpu_strict" || field == "cpu_strict" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") { field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
return BOOL; return BOOL;
@ -938,9 +927,7 @@ struct test {
} }
std::vector<std::string> values = { std::vector<std::string> values = {
build_commit, std::to_string(build_number), build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan), cpu_info, gpu_info, get_backend(),
std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch), std::to_string(n_batch), std::to_string(n_ubatch),
std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll), std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
@ -967,13 +954,6 @@ struct test {
const std::string test::build_commit = LLAMA_COMMIT; const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER; const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cuda();
const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::kompute = !!ggml_cpu_has_kompute();
const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas();
const bool test::sycl = !!ggml_cpu_has_sycl();
const std::string test::cpu_info = get_cpu_info(); const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info(); const std::string test::gpu_info = get_gpu_info();
@ -1178,7 +1158,8 @@ struct markdown_printer : public printer {
fields.emplace_back("size"); fields.emplace_back("size");
fields.emplace_back("params"); fields.emplace_back("params");
fields.emplace_back("backend"); fields.emplace_back("backend");
bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS"; bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos ||
test::get_backend().find("BLAS") != std::string::npos;
if (!is_cpu_backend) { if (!is_cpu_backend) {
fields.emplace_back("n_gpu_layers"); fields.emplace_back("n_gpu_layers");
} }
@ -1268,9 +1249,6 @@ struct markdown_printer : public printer {
value = buf; value = buf;
} else if (field == "backend") { } else if (field == "backend") {
value = test::get_backend(); value = test::get_backend();
if (t.has_rpc) {
value += "+RPC";
}
} else if (field == "test") { } else if (field == "test") {
if (t.n_prompt > 0 && t.n_gen == 0) { if (t.n_prompt > 0 && t.n_gen == 0) {
snprintf(buf, sizeof(buf), "pp%d", t.n_prompt); snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);

View File

@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
} }
static void test_roundtrip_on_chunk( static void test_roundtrip_on_chunk(
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference, const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
) { ) {
if (layer->type == GGML_TYPE_F16) { if (layer->type == GGML_TYPE_F16) {
@ -156,7 +156,7 @@ static void test_roundtrip_on_chunk(
if (use_reference) { if (use_reference) {
qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size); qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
} else { } else {
qfns.from_float(input_scratch, quantized_scratch, chunk_size); qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
} }
qfns.to_float(quantized_scratch, output_scratch, chunk_size); qfns.to_float(quantized_scratch, output_scratch, chunk_size);
@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(
// Run quantization function for a single layer and update error stats // Run quantization function for a single layer and update error stats
static void test_roundtrip_on_layer( static void test_roundtrip_on_layer(
std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference, std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch, const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0 std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
) { ) {
@ -187,13 +187,13 @@ static void test_roundtrip_on_layer(
int num_chunks = (nelements + chunk_size - 1)/chunk_size; int num_chunks = (nelements + chunk_size - 1)/chunk_size;
if (num_chunks < 2 || max_thread < 2) { if (num_chunks < 2 || max_thread < 2) {
test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(), test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
output_scratch.data(), print_layer_stats ? layer_error : total_error); output_scratch.data(), print_layer_stats ? layer_error : total_error);
} else { } else {
auto & stats = print_layer_stats ? layer_error : total_error; auto & stats = print_layer_stats ? layer_error : total_error;
std::mutex mutex; std::mutex mutex;
uint64_t counter = 0; uint64_t counter = 0;
auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr, auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
&quantized_scratch, &output_scratch, chunk_size] () { &quantized_scratch, &output_scratch, chunk_size] () {
error_stats local_stats {}; error_stats local_stats {};
while (true) { while (true) {
@ -205,7 +205,7 @@ static void test_roundtrip_on_layer(
} }
lock.unlock(); lock.unlock();
uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset; uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset, test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats); quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
} }
}; };
@ -371,8 +371,9 @@ int main(int argc, char ** argv) {
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
continue; continue;
} }
const auto * qfns = ggml_get_type_traits(type); const auto * qfns = ggml_get_type_traits(type);
if (qfns->from_float && qfns->to_float) { const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
if (qfns_cpu->from_float && qfns->to_float) {
if (params.verbose) { if (params.verbose) {
printf("testing %s ...\n", ggml_type_name(type)); printf("testing %s ...\n", ggml_type_name(type));
} }
@ -393,7 +394,7 @@ int main(int argc, char ** argv) {
test_roundtrip_on_layer( test_roundtrip_on_layer(
layer_name, layer_name,
params.per_layer_stats, params.per_layer_stats,
*qfns, *qfns, *qfns_cpu,
params.reference, params.reference,
kv_tensor.second, kv_tensor.second,
input_scratch, input_scratch,

View File

@ -116,6 +116,7 @@ endif()
# ggml core # ggml core
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism") set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
option(GGML_CPU "ggml: enable CPU backend" ON)
# 3rd party libs / backends # 3rd party libs / backends
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON) option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
@ -141,7 +142,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT}) option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF) option(GGML_HIP "ggml: use HIP" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF) option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF) option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF) option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
@ -238,12 +239,15 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml PUBLIC_HEADER) install(TARGETS ggml PUBLIC_HEADER)
if (BUILD_SHARED_LIBS) if (BUILD_SHARED_LIBS)
install(TARGETS ggml LIBRARY) install(TARGETS ggml LIBRARY)
install(TARGETS ggml-base LIBRARY)
endif() endif()
# FIXME: this should be done in the backend cmake files
if (GGML_METAL) if (GGML_METAL)
# FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
install( install(
FILES src/ggml-metal.metal FILES src/ggml-metal/ggml-metal.metal
PERMISSIONS PERMISSIONS
OWNER_READ OWNER_READ
OWNER_WRITE OWNER_WRITE

View File

@ -9,16 +9,16 @@ extern "C" {
#endif #endif
// buffer_type API // buffer_type API
GGML_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
GGML_API bool ggml_backend_is_amx(ggml_backend_t backend); GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
// backend API // backend API
GGML_API ggml_backend_t ggml_backend_amx_init(void); GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
GGML_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads); GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
GGML_API ggml_backend_reg_t ggml_backend_amx_reg(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -3,6 +3,20 @@
#include "ggml.h" #include "ggml.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"
#ifdef GGML_BACKEND_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef GGML_BACKEND_BUILD
# define GGML_BACKEND_API __declspec(dllexport) extern
# else
# define GGML_BACKEND_API __declspec(dllimport) extern
# endif
# else
# define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
# endif
#else
# define GGML_BACKEND_API extern
#endif
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif

View File

@ -9,15 +9,15 @@ extern "C" {
#endif #endif
// backend API // backend API
GGML_API ggml_backend_t ggml_backend_blas_init(void); GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
GGML_API bool ggml_backend_is_blas(ggml_backend_t backend); GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
// number of threads used for conversion to float // number of threads used for conversion to float
// for openblas and blis, this will also set the number of threads used for blas operations // for openblas and blis, this will also set the number of threads used for blas operations
GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
GGML_API ggml_backend_reg_t ggml_backend_blas_reg(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -34,7 +34,7 @@ extern "C" {
*/ */
#define GGML_CANN_MAX_DEVICES 16 #define GGML_CANN_MAX_DEVICES 16
GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
/** /**
* @brief Initializes the CANN backend for a specified device. * @brief Initializes the CANN backend for a specified device.
@ -46,7 +46,7 @@ GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void);
* @param device The index of the device to initialize. * @param device The index of the device to initialize.
* @return A pointer to the initialized backend instance, or nullptr on failure. * @return A pointer to the initialized backend instance, or nullptr on failure.
*/ */
GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device); GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
/** /**
* @brief Checks if a given backend is a CANN backend. * @brief Checks if a given backend is a CANN backend.
@ -57,7 +57,7 @@ GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
* @param backend The backend instance to check. * @param backend The backend instance to check.
* @return True if the backend is a CANN backend, false otherwise. * @return True if the backend is a CANN backend, false otherwise.
*/ */
GGML_API bool ggml_backend_is_cann(ggml_backend_t backend); GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
/** /**
* @brief Retrieves the CANN buffer type for a specified device. * @brief Retrieves the CANN buffer type for a specified device.
@ -69,7 +69,7 @@ GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
* @return A pointer to the buffer type interface for the specified device, or * @return A pointer to the buffer type interface for the specified device, or
* nullptr if the device index is out of range. * nullptr if the device index is out of range.
*/ */
GGML_API ggml_backend_buffer_type_t GGML_BACKEND_API ggml_backend_buffer_type_t
ggml_backend_cann_buffer_type(int32_t device); ggml_backend_cann_buffer_type(int32_t device);
/** /**
@ -80,14 +80,14 @@ ggml_backend_cann_buffer_type(int32_t device);
* *
* @return The number of CANN devices available. * @return The number of CANN devices available.
*/ */
GGML_API int32_t ggml_backend_cann_get_device_count(void); GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
/** /**
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU. * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
* *
* @return A pointer to the host buffer type interface. * @return A pointer to the host buffer type interface.
*/ */
GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
/** /**
* @brief Retrieves the description of a specific CANN device. * @brief Retrieves the description of a specific CANN device.
@ -99,7 +99,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
* @param description Pointer to a buffer where the description will be written. * @param description Pointer to a buffer where the description will be written.
* @param description_size Size of the description buffer. * @param description_size Size of the description buffer.
*/ */
GGML_API void ggml_backend_cann_get_device_description( GGML_BACKEND_API void ggml_backend_cann_get_device_description(
int32_t device, char* description, size_t description_size); int32_t device, char* description, size_t description_size);
/** /**
@ -114,7 +114,7 @@ GGML_API void ggml_backend_cann_get_device_description(
* @param total Pointer to a variable where the total memory size will be * @param total Pointer to a variable where the total memory size will be
* stored. * stored.
*/ */
GGML_API void ggml_backend_cann_get_device_memory(int32_t device, GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
size_t* free, size_t* free,
size_t* total); size_t* total);

View File

@ -54,54 +54,77 @@ extern "C" {
GGML_NUMA_STRATEGY_COUNT GGML_NUMA_STRATEGY_COUNT
}; };
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value); GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute() // ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data // when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan( GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph, const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */ int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ ); struct ggml_threadpool * threadpool /* = NULL */ );
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context // same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
// TODO: move to backend interface //
GGML_API int ggml_cpu_has_neon (void); // system info
GGML_API int ggml_cpu_has_sve (void); //
GGML_API int ggml_cpu_has_matmul_int8(void);
// get the sve vector length in bytes // x86
GGML_API int ggml_cpu_get_sve_cnt(void); GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
GGML_BACKEND_API int ggml_cpu_has_avx (void);
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
GGML_BACKEND_API int ggml_cpu_has_fma (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void);
// ARM
GGML_BACKEND_API int ggml_cpu_has_neon (void);
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
GGML_BACKEND_API int ggml_cpu_has_sve (void);
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
// other
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
// Internal types and functions exposed for tests and benchmarks // Internal types and functions exposed for tests and benchmarks
@ -115,6 +138,7 @@ extern "C" {
const void * GGML_RESTRICT y, int nr, int nc); const void * GGML_RESTRICT y, int nr, int nc);
struct ggml_type_traits_cpu { struct ggml_type_traits_cpu {
ggml_from_float_t from_float;
ggml_from_float_to_mat_t from_float_to_mat; ggml_from_float_to_mat_t from_float_to_mat;
ggml_vec_dot_t vec_dot; ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type; enum ggml_type vec_dot_type;
@ -124,25 +148,25 @@ extern "C" {
ggml_gemm_t gemm; ggml_gemm_t gemm;
}; };
GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type); GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
GGML_API void ggml_cpu_init(void); GGML_BACKEND_API void ggml_cpu_init(void);
// //
// CPU backend // CPU backend
// //
GGML_API ggml_backend_t ggml_backend_cpu_init(void); GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend); GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
#ifdef GGML_USE_CPU_HBM #ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -7,7 +7,7 @@
extern "C" { extern "C" {
#endif #endif
#ifdef GGML_USE_HIPBLAS #ifdef GGML_USE_HIP
#define GGML_CUDA_NAME "ROCm" #define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS" #define GGML_CUBLAS_NAME "hipBLAS"
#elif defined(GGML_USE_MUSA) #elif defined(GGML_USE_MUSA)
@ -20,27 +20,27 @@ extern "C" {
#define GGML_CUDA_MAX_DEVICES 16 #define GGML_CUDA_MAX_DEVICES 16
// backend API // backend API
GGML_API ggml_backend_t ggml_backend_cuda_init(int device); GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device);
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend); GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
// device buffer // device buffer
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices // split tensor buffer that splits matrices by rows across multiple devices
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_API int ggml_backend_cuda_get_device_count(void); GGML_BACKEND_API int ggml_backend_cuda_get_device_count(void);
GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer); GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -37,13 +37,13 @@ struct ggml_vk_device ggml_vk_current_device(void);
// forward declaration // forward declaration
typedef struct ggml_backend * ggml_backend_t; typedef struct ggml_backend * ggml_backend_t;
GGML_API ggml_backend_t ggml_backend_kompute_init(int device); GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend); GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
GGML_API ggml_backend_reg_t ggml_backend_kompute_reg(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -39,27 +39,27 @@ extern "C" {
// user-code should use only these functions // user-code should use only these functions
// //
GGML_API ggml_backend_t ggml_backend_metal_init(void); GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_DEPRECATED( GGML_DEPRECATED(
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size), GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
"obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713"); "obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family // helper to check if the device supports a specific family
// ideally, the user code should be doing these checks // ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family); GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called // capture all command buffers committed the next time `ggml_backend_graph_compute` is called
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend); GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
GGML_API ggml_backend_reg_t ggml_backend_metal_reg(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -10,18 +10,18 @@ extern "C" {
#define GGML_RPC_MAX_SERVERS 16 #define GGML_RPC_MAX_SERVERS 16
// backend API // backend API
GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint); GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend); GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
GGML_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
GGML_API ggml_backend_reg_t ggml_backend_rpc_reg(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
GGML_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint); GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -17,32 +17,32 @@ extern "C" {
#endif #endif
// backend API // backend API
GGML_API ggml_backend_t ggml_backend_sycl_init(int device); GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device);
GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend); GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend);
// devide buffer // devide buffer
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices // split tensor buffer that splits matrices by rows across multiple devices
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void); GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len); GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API void ggml_backend_sycl_get_device_description(int device, GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device,
char *description, char *description,
size_t description_size); size_t description_size);
GGML_API int ggml_backend_sycl_get_device_count(); GGML_BACKEND_API int ggml_backend_sycl_get_device_count();
GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
// SYCL doesn't support registering host memory, keep here for reference // SYCL doesn't support registering host memory, keep here for reference
// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); // GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer); // GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
GGML_API ggml_backend_reg_t ggml_backend_sycl_reg(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -10,21 +10,21 @@ extern "C" {
#define GGML_VK_NAME "Vulkan" #define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16 #define GGML_VK_MAX_DEVICES 16
GGML_API void ggml_vk_instance_init(void); GGML_BACKEND_API void ggml_vk_instance_init(void);
// backend API // backend API
GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num); GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
GGML_API bool ggml_backend_is_vk(ggml_backend_t backend); GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API int ggml_backend_vk_get_device_count(void); GGML_BACKEND_API int ggml_backend_vk_get_device_count(void);
GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -176,15 +176,15 @@
#ifdef GGML_SHARED #ifdef GGML_SHARED
# if defined(_WIN32) && !defined(__MINGW32__) # if defined(_WIN32) && !defined(__MINGW32__)
# ifdef GGML_BUILD # ifdef GGML_BUILD
# define GGML_API __declspec(dllexport) # define GGML_API __declspec(dllexport) extern
# else # else
# define GGML_API __declspec(dllimport) # define GGML_API __declspec(dllimport) extern
# endif # endif
# else # else
# define GGML_API __attribute__ ((visibility ("default"))) # define GGML_API __attribute__ ((visibility ("default"))) extern
# endif # endif
#else #else
# define GGML_API # define GGML_API extern
#endif #endif
// TODO: support for clang // TODO: support for clang
@ -1490,7 +1490,7 @@ extern "C" {
"use ggml_rope_ext_inplace instead"); "use ggml_rope_ext_inplace instead");
// compute correction dims for YaRN RoPE scaling // compute correction dims for YaRN RoPE scaling
void ggml_rope_yarn_corr_dims( GGML_API void ggml_rope_yarn_corr_dims(
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]); int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
// rotary position embedding backward, i.e compute dx from dy // rotary position embedding backward, i.e compute dx from dy
@ -2384,38 +2384,6 @@ extern "C" {
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
//
// system info
//
GGML_API int ggml_cpu_has_avx (void);
GGML_API int ggml_cpu_has_avx_vnni (void);
GGML_API int ggml_cpu_has_avx2 (void);
GGML_API int ggml_cpu_has_avx512 (void);
GGML_API int ggml_cpu_has_avx512_vbmi(void);
GGML_API int ggml_cpu_has_avx512_vnni(void);
GGML_API int ggml_cpu_has_avx512_bf16(void);
GGML_API int ggml_cpu_has_amx_int8 (void);
GGML_API int ggml_cpu_has_fma (void);
GGML_API int ggml_cpu_has_arm_fma (void);
GGML_API int ggml_cpu_has_metal (void);
GGML_API int ggml_cpu_has_f16c (void);
GGML_API int ggml_cpu_has_fp16_va (void);
GGML_API int ggml_cpu_has_wasm_simd (void);
GGML_API int ggml_cpu_has_blas (void);
GGML_API int ggml_cpu_has_cuda (void);
GGML_API int ggml_cpu_has_vulkan (void);
GGML_API int ggml_cpu_has_kompute (void);
GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_riscv_v (void);
GGML_API int ggml_cpu_has_sycl (void);
GGML_API int ggml_cpu_has_rpc (void);
GGML_API int ggml_cpu_has_vsx (void);
GGML_API int ggml_cpu_has_cann (void);
GGML_API int ggml_cpu_has_llamafile (void);
#ifdef __cplusplus #ifdef __cplusplus
// restrict not standard in C++ // restrict not standard in C++
#define GGML_RESTRICT #define GGML_RESTRICT
@ -2432,7 +2400,6 @@ extern "C" {
size_t type_size; size_t type_size;
bool is_quantized; bool is_quantized;
ggml_to_float_t to_float; ggml_to_float_t to_float;
ggml_from_float_t from_float;
ggml_from_float_t from_float_ref; ggml_from_float_t from_float_ref;
}; };

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,9 +1,5 @@
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
#pragma once #pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h" #include "ggml.h"
// GGML internal header // GGML internal header
@ -12,27 +8,11 @@
extern "C" { extern "C" {
#endif #endif
// Quantization
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// GEMV
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// GEMM
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -0,0 +1,107 @@
if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND
CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
message(STATUS "Using AMX")
file(GLOB GGML_HEADERS_AMX "*.h")
list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")
file(GLOB GGML_SOURCES_AMX "*.cpp")
add_library(ggml-amx
${GGML_HEADERS_AMX}
${GGML_SOURCES_AMX})
target_link_libraries(ggml-amx PRIVATE ggml-base)
target_include_directories(ggml-amx PRIVATE . ..)
# this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
# TODO: integrate AMX backend into the CPU backend
if (MSVC)
# instruction set detection for MSVC only
if (GGML_NATIVE)
# TODO: improve, should not reference files from the parent folder
include(../ggml-cpu/cmake/FindSIMD.cmake)
endif ()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS /arch:AVX512)
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the
# macros corresponding to the extensions.
# Do it manually.
if (GGML_AVX512_VBMI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
endif()
if (GGML_AVX512_VNNI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
if (GGML_AVX512_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
endif()
if (GGML_AMX_TILE)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
endif()
if (GGML_AMX_INT8)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
endif()
if (GGML_AMX_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
endif()
elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX)
endif()
else()
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
endif()
if (GGML_F16C)
list(APPEND ARCH_FLAGS -mf16c)
endif()
if (GGML_FMA)
list(APPEND ARCH_FLAGS -mfma)
endif()
if (GGML_AVX)
list(APPEND ARCH_FLAGS -mavx)
endif()
if (GGML_AVX2)
list(APPEND ARCH_FLAGS -mavx2)
endif()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512dq)
list(APPEND ARCH_FLAGS -mavx512bw)
endif()
if (GGML_AVX512_VBMI)
list(APPEND ARCH_FLAGS -mavx512vbmi)
endif()
if (GGML_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
if (GGML_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
if (GGML_AMX_TILE)
list(APPEND ARCH_FLAGS -mamx-tile)
endif()
if (GGML_AMX_INT8)
list(APPEND ARCH_FLAGS -mamx-int8)
endif()
if (GGML_AMX_BF16)
list(APPEND ARCH_FLAGS -mamx-bf16)
endif()
endif()
target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
else()
set(GGML_AMX OFF PARENT_SCOPE)
message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
endif()

View File

@ -1,7 +1,8 @@
#pragma once #pragma once
#include "ggml.h" #include "ggml.h"
#include "ggml-cpu-impl.h" // <immintrin.h> // hack until AMX is moved into the CPU backend
#include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>

View File

@ -421,9 +421,18 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) {
#else // if defined(__AMX_INT8__) #else // if defined(__AMX_INT8__)
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) {
return nullptr;
}
bool ggml_backend_is_amx(ggml_backend_t backend) {
GGML_UNUSED(backend);
return false;
}
ggml_backend_t ggml_backend_amx_init(void) { ggml_backend_t ggml_backend_amx_init(void) {
fprintf(stderr, "GGML is not compiled with AMX support!\n"); fprintf(stderr, "GGML is not compiled with AMX support!\n");
return ggml_backend_t{}; return nullptr;
} }
void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) { void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
@ -433,4 +442,8 @@ void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
GGML_UNUSED(n_threads); GGML_UNUSED(n_threads);
} }
ggml_backend_reg_t ggml_backend_amx_reg(void) {
return nullptr;
}
#endif #endif

View File

@ -496,19 +496,20 @@ inline void from_float(const float * x, char * vy, int64_t k);
template <> template <>
inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) { inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) {
quantize_row_q8_0(x, vy, k); // FIXME: using unoptimized reference impl until moved to CPU backend
quantize_row_q8_0_ref(x, (block_q8_0 *)vy, k);
} }
template <> template <>
inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) { inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) {
quantize_row_q8_1(x, vy, k); quantize_row_q8_1_ref(x, (block_q8_1 *)vy, k);
} }
template <> template <>
inline void from_float<block_q8_K>(const float * x, char * vy, int64_t k) { inline void from_float<block_q8_K>(const float * x, char * vy, int64_t k) {
#if 1 #if 1
// TODO: this is reference impl! // TODO: this is reference impl!
quantize_row_q8_K(x, vy, k); quantize_row_q8_K_ref(x, (block_q8_K *)vy, k);
#else #else
quantize_row_q8_K_vnni(x, vy, k); quantize_row_q8_K_vnni(x, vy, k);
#endif #endif

View File

@ -0,0 +1,195 @@
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml-impl.h"
#include <cstring>
#include <vector>
// Backend registry
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
#ifdef GGML_USE_RPC
#include "ggml-rpc.h"
#endif
#ifdef GGML_USE_AMX
# include "ggml-amx.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_KOMPUTE
#include "ggml-kompute.h"
#endif
struct ggml_backend_registry {
std::vector<ggml_backend_reg_t> backends;
std::vector<ggml_backend_dev_t> devices;
ggml_backend_registry() {
#ifdef GGML_USE_CUDA
register_backend(ggml_backend_cuda_reg());
#endif
#ifdef GGML_USE_METAL
register_backend(ggml_backend_metal_reg());
#endif
#ifdef GGML_USE_SYCL
register_backend(ggml_backend_sycl_reg());
#endif
#ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg());
#endif
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
#ifdef GGML_USE_BLAS
register_backend(ggml_backend_blas_reg());
#endif
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
#ifdef GGML_USE_AMX
register_backend(ggml_backend_amx_reg());
#endif
#ifdef GGML_USE_KOMPUTE
register_backend(ggml_backend_kompute_reg());
#endif
register_backend(ggml_backend_cpu_reg());
}
void register_backend(ggml_backend_reg_t reg) {
if (!reg) {
return;
}
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
#endif
backends.push_back(reg);
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
register_device(ggml_backend_reg_dev_get(reg, i));
}
}
void register_device(ggml_backend_dev_t device) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
#endif
devices.push_back(device);
}
};
static ggml_backend_registry & get_reg() {
static ggml_backend_registry reg;
return reg;
}
// Internal API
void ggml_backend_register(ggml_backend_reg_t reg) {
get_reg().register_backend(reg);
}
void ggml_backend_device_register(ggml_backend_dev_t device) {
get_reg().register_device(device);
}
// Backend (reg) enumeration
size_t ggml_backend_reg_count() {
return get_reg().backends.size();
}
ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
GGML_ASSERT(index < ggml_backend_reg_count());
return get_reg().backends[index];
}
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
ggml_backend_reg_t reg = ggml_backend_reg_get(i);
if (std::strcmp(ggml_backend_reg_name(reg), name) == 0) {
return reg;
}
}
return NULL;
}
// Device enumeration
size_t ggml_backend_dev_count() {
return get_reg().devices.size();
}
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
GGML_ASSERT(index < ggml_backend_dev_count());
return get_reg().devices[index];
}
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
return dev;
}
}
return NULL;
}
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (ggml_backend_dev_type(dev) == type) {
return dev;
}
}
return NULL;
}
// Convenience functions
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
if (!dev) {
return NULL;
}
return ggml_backend_dev_init(dev, params);
}
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
if (!dev) {
return NULL;
}
return ggml_backend_dev_init(dev, params);
}
ggml_backend_t ggml_backend_init_best(void) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
if (!dev) {
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
}
if (!dev) {
return NULL;
}
return ggml_backend_dev_init(dev, NULL);
}

View File

@ -525,197 +525,6 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
return reg->iface.get_proc_address(reg, name); return reg->iface.get_proc_address(reg, name);
} }
// Backend registry
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
#ifdef GGML_USE_RPC
#include "ggml-rpc.h"
#endif
#ifndef __AMX_INT8__
#undef GGML_USE_AMX
#endif
#ifdef GGML_USE_AMX
# include "ggml-amx.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_KOMPUTE
#include "ggml-kompute.h"
#endif
#include "ggml-cpu.h"
struct ggml_backend_registry {
std::vector<ggml_backend_reg_t> backends;
std::vector<ggml_backend_dev_t> devices;
ggml_backend_registry() {
#ifdef GGML_USE_CUDA
register_backend(ggml_backend_cuda_reg());
#endif
#ifdef GGML_USE_METAL
register_backend(ggml_backend_metal_reg());
#endif
#ifdef GGML_USE_SYCL
register_backend(ggml_backend_sycl_reg());
#endif
#ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg());
#endif
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
#ifdef GGML_USE_BLAS
register_backend(ggml_backend_blas_reg());
#endif
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
#ifdef GGML_USE_AMX
register_backend(ggml_backend_amx_reg());
#endif
#ifdef GGML_USE_KOMPUTE
register_backend(ggml_backend_kompute_reg());
#endif
register_backend(ggml_backend_cpu_reg());
}
void register_backend(ggml_backend_reg_t reg) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
#endif
backends.push_back(reg);
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
register_device(ggml_backend_reg_dev_get(reg, i));
}
}
void register_device(ggml_backend_dev_t device) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
#endif
devices.push_back(device);
}
};
static ggml_backend_registry & get_reg() {
static ggml_backend_registry reg;
return reg;
}
// Internal API
void ggml_backend_register(ggml_backend_reg_t reg) {
get_reg().register_backend(reg);
}
void ggml_backend_device_register(ggml_backend_dev_t device) {
get_reg().register_device(device);
}
// Backend (reg) enumeration
size_t ggml_backend_reg_count() {
return get_reg().backends.size();
}
ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
GGML_ASSERT(index < ggml_backend_reg_count());
return get_reg().backends[index];
}
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
ggml_backend_reg_t reg = ggml_backend_reg_get(i);
if (strcmp(ggml_backend_reg_name(reg), name) == 0) {
return reg;
}
}
return NULL;
}
// Device enumeration
size_t ggml_backend_dev_count() {
return get_reg().devices.size();
}
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
GGML_ASSERT(index < ggml_backend_dev_count());
return get_reg().devices[index];
}
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
return dev;
}
}
return NULL;
}
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (ggml_backend_dev_type(dev) == type) {
return dev;
}
}
return NULL;
}
// Convenience functions
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
if (!dev) {
return NULL;
}
return ggml_backend_dev_init(dev, params);
}
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
if (!dev) {
return NULL;
}
return ggml_backend_dev_init(dev, params);
}
ggml_backend_t ggml_backend_init_best(void) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
if (!dev) {
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
}
if (!dev) {
return NULL;
}
return ggml_backend_dev_init(dev, NULL);
}
// multi-buffer buffer // multi-buffer buffer
struct ggml_backend_multi_buffer_context { struct ggml_backend_multi_buffer_context {
@ -1640,7 +1449,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
bool parallel) { bool parallel) {
GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched)); struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
@ -2036,17 +1845,6 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
return true; return true;
} }
#include "ggml-backend.h"
#include "ggml-backend-impl.h"
#include "ggml-cpu.h"
#include "ggml-impl.h"
#include <cctype>
#include <string>
// ggml-backend interface
// CPU backend - buffer // CPU backend - buffer
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
@ -2120,7 +1918,9 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
/* .reset = */ NULL, /* .reset = */ NULL,
}; };
// CPU backend - buffer type // CPU backend buffer type
// this buffer type is defined here to make it available to all backends
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU"; return "CPU";
@ -2161,7 +1961,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host, /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
}, },
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ NULL, /* .context = */ NULL,
}; };
@ -2184,478 +1984,14 @@ static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host, /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
}, },
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ NULL, /* .context = */ NULL,
}; };
return &ggml_backend_cpu_buffer_type; return &ggml_backend_cpu_buffer_type;
} }
#ifdef GGML_USE_CPU_HBM
// buffer type HBM
#include <hbwmalloc.h>
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU_HBM";
GGML_UNUSED(buft);
}
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
hbw_free(buffer->context);
}
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * ptr;
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
if (result != 0) {
GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
return NULL;
}
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
return buffer;
}
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
/* .iface = */ {
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
},
/* .context = */ NULL,
};
return &ggml_backend_cpu_buffer_type_hbm;
}
#endif
static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
static ggml_backend_buffer_type_t bufts[] = {
#ifdef GGML_USE_CPU_HBM
ggml_backend_cpu_hbm_buffer_type(),
#endif
NULL
};
return bufts;
GGML_UNUSED(device);
}
// CPU backend - backend (stream)
struct ggml_backend_cpu_context {
int n_threads;
ggml_threadpool_t threadpool;
uint8_t * work_data;
size_t work_size;
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
return "CPU";
GGML_UNUSED(backend);
}
static void ggml_backend_cpu_free(ggml_backend_t backend) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
delete[] cpu_ctx->work_data;
delete cpu_ctx;
delete backend;
}
struct ggml_backend_plan_cpu {
struct ggml_cplan cplan;
struct ggml_cgraph cgraph;
};
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
if (cpu_plan->cplan.work_size > 0) {
cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
if (cpu_plan->cplan.work_data == NULL) {
delete cpu_plan;
return NULL;
}
}
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
return cpu_plan;
}
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
delete[] cpu_plan->cplan.work_data;
delete cpu_plan;
GGML_UNUSED(backend);
}
static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
GGML_UNUSED(backend);
}
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
if (cpu_ctx->work_size < cplan.work_size) {
delete[] cpu_ctx->work_data;
cpu_ctx->work_data = new uint8_t[cplan.work_size];
if (cpu_ctx->work_data == NULL) {
cpu_ctx->work_size = 0;
return GGML_STATUS_ALLOC_FAILED;
}
cpu_ctx->work_size = cplan.work_size;
}
cplan.work_data = (uint8_t *)cpu_ctx->work_data;
cplan.abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
return ggml_graph_compute(cgraph, &cplan);
}
static const struct ggml_backend_i ggml_backend_cpu_i = {
/* .get_name = */ ggml_backend_cpu_get_name,
/* .free = */ ggml_backend_cpu_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
static ggml_guid_t ggml_backend_cpu_guid(void) {
static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
return &guid;
}
ggml_backend_t ggml_backend_cpu_init(void) {
// initialize CPU backend now to avoid slowing the first graph computation
ggml_cpu_init();
struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
if (ctx == NULL) {
return NULL;
}
ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->threadpool = NULL;
ctx->work_data = NULL;
ctx->work_size = 0;
ctx->abort_callback = NULL;
ctx->abort_callback_data = NULL;
ggml_backend_t cpu_backend = new ggml_backend {
/* .guid = */ ggml_backend_cpu_guid(),
/* .interface = */ ggml_backend_cpu_i,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ ctx,
};
if (cpu_backend == NULL) {
delete ctx;
return NULL;
}
return cpu_backend;
}
bool ggml_backend_is_cpu(ggml_backend_t backend) {
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
}
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
ctx->n_threads = n_threads;
}
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
if (ctx->threadpool && ctx->threadpool != threadpool) {
// already had a different threadpool, pause/suspend it before switching
ggml_threadpool_pause(ctx->threadpool);
}
ctx->threadpool = threadpool;
}
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
ctx->abort_callback = abort_callback;
ctx->abort_callback_data = abort_callback_data;
}
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned"); GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size); return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
} }
// CPU backend - device
struct ggml_backend_cpu_device_context {
std::string description = "CPU";
ggml_backend_cpu_device_context() {
#ifdef __APPLE__
size_t len = 0;
if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
description.resize(len);
sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
}
#elif defined(__linux__)
FILE * f = fopen("/proc/cpuinfo", "r");
if (f) {
char buf[1024];
while (fgets(buf, sizeof(buf), f)) {
if (strncmp(buf, "model name", 10) == 0) {
char * p = strchr(buf, ':');
if (p) {
p++;
while (std::isspace(*p)) {
p++;
}
while (std::isspace(p[strlen(p) - 1])) {
p[strlen(p) - 1] = '\0';
}
description = p;
break;
}
}
}
fclose(f);
}
#elif defined(_WIN32)
HKEY hKey;
if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
0,
KEY_READ,
&hKey) == ERROR_SUCCESS) {
DWORD cpu_brand_size = 0;
if (RegQueryValueExA(hKey,
TEXT("ProcessorNameString"),
NULL,
NULL,
NULL,
&cpu_brand_size) == ERROR_SUCCESS) {
description.resize(cpu_brand_size);
if (RegQueryValueExA(hKey,
TEXT("ProcessorNameString"),
NULL,
NULL,
(LPBYTE)&description[0], // NOLINT
&cpu_brand_size) == ERROR_SUCCESS) {
if (description.find('\0') != std::string::npos) {
description.resize(description.find('\0'));
}
}
}
RegCloseKey(hKey);
}
#endif
}
};
static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
return "CPU";
GGML_UNUSED(dev);
}
static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
return ctx->description.c_str();
}
static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
// TODO
*free = 0;
*total = 0;
GGML_UNUSED(dev);
}
static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_CPU;
GGML_UNUSED(dev);
}
static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_cpu_device_get_name(dev);
props->description = ggml_backend_cpu_device_get_description(dev);
props->type = ggml_backend_cpu_device_get_type(dev);
ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* .async = */ false,
/* .host_buffer = */ false,
/* .buffer_from_host_ptr = */ true,
/* .events = */ false,
};
}
static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
return ggml_backend_cpu_init();
GGML_UNUSED(dev);
GGML_UNUSED(params);
}
static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
return ggml_backend_cpu_buffer_type();
GGML_UNUSED(dev);
}
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
GGML_UNUSED(dev);
GGML_UNUSED(max_tensor_size);
}
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
switch (op->op) {
case GGML_OP_CPY:
return
op->type != GGML_TYPE_IQ2_XXS &&
op->type != GGML_TYPE_IQ2_XS &&
op->type != GGML_TYPE_IQ1_S &&
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
return op->src[1]->type == GGML_TYPE_F32;// FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
case GGML_OP_ROPE_BACK:
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
case GGML_OP_IM2COL_BACK:
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
case GGML_OP_OUT_PROD:
return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
default:
return true;
}
GGML_UNUSED(dev);
}
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(buft);
GGML_UNUSED(dev);
}
static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
/* .get_name = */ ggml_backend_cpu_device_get_name,
/* .get_description = */ ggml_backend_cpu_device_get_description,
/* .get_memory = */ ggml_backend_cpu_device_get_memory,
/* .get_type = */ ggml_backend_cpu_device_get_type,
/* .get_props = */ ggml_backend_cpu_device_get_props,
/* .init_backend = */ ggml_backend_cpu_device_init_backend,
/* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
/* .supports_op = */ ggml_backend_cpu_device_supports_op,
/* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
// CPU backend - backend (reg)
static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
return "CPU";
GGML_UNUSED(reg);
}
static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
return 1;
GGML_UNUSED(reg);
}
static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
GGML_ASSERT(index == 0);
static ggml_backend_cpu_device_context ctx;
static ggml_backend_device ggml_backend_cpu_device = {
/* .iface = */ ggml_backend_cpu_device_i,
/* .reg = */ reg,
/* .context = */ &ctx,
};
return &ggml_backend_cpu_device;
}
static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
return (void *)ggml_backend_cpu_set_n_threads;
}
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
return (void *)ggml_backend_cpu_get_extra_bufts;
}
return NULL;
GGML_UNUSED(reg);
}
static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
/* .get_name = */ ggml_backend_cpu_reg_get_name,
/* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
/* .get_device = */ ggml_backend_cpu_reg_get_device,
/* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
};
ggml_backend_reg_t ggml_backend_cpu_reg(void) {
static struct ggml_backend_reg ggml_backend_cpu_reg = {
/* .iface = */ ggml_backend_cpu_reg_i,
/* .context = */ NULL,
};
return &ggml_backend_cpu_reg;
}

View File

@ -0,0 +1,91 @@
if (GGML_STATIC)
set(BLA_STATIC ON)
endif()
#if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
# set(BLA_SIZEOF_INTEGER 8)
#endif()
set(BLA_VENDOR ${GGML_BLAS_VENDOR})
find_package(BLAS)
if (BLAS_FOUND)
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
add_library(ggml-blas
ggml-blas.cpp
)
target_link_libraries(ggml-blas PRIVATE ggml-base)
target_include_directories(ggml-blas PRIVATE . ..)
if (${GGML_BLAS_VENDOR} MATCHES "Apple")
add_compile_definitions(ACCELERATE_NEW_LAPACK)
add_compile_definitions(ACCELERATE_LAPACK_ILP64)
add_compile_definitions(GGML_BLAS_USE_ACCELERATE)
elseif ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
# BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
# see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
find_package(PkgConfig REQUIRED)
if (${GGML_BLAS_VENDOR} MATCHES "Generic")
pkg_check_modules(DepBLAS blas)
elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS")
# As of openblas v0.3.22, the 64-bit is named openblas64.pc
pkg_check_modules(DepBLAS openblas64)
if (NOT DepBLAS_FOUND)
pkg_check_modules(DepBLAS openblas)
endif()
elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
add_compile_definitions(GGML_BLAS_USE_BLIS)
pkg_check_modules(DepBLAS blis)
elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
pkg_check_modules(DepBLAS blas-atlas)
elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
pkg_check_modules(DepBLAS flexiblas_api)
elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
add_compile_definitions(GGML_BLAS_USE_MKL)
# all Intel* libraries share the same include path
pkg_check_modules(DepBLAS mkl-sdl)
elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
# this doesn't provide pkg-config
# suggest to assign BLAS_INCLUDE_DIRS on your own
if ("${NVHPC_VERSION}" STREQUAL "")
message(WARNING "Better to set NVHPC_VERSION")
else()
set(DepBLAS_FOUND ON)
set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
endif()
endif()
if (DepBLAS_FOUND)
set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
else()
message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
" detected by pkgconfig, trying to find cblas.h from possible paths...")
find_path(BLAS_INCLUDE_DIRS
NAMES cblas.h
HINTS
/usr/include
/usr/local/include
/usr/include/openblas
/opt/homebrew/opt/openblas/include
/usr/local/opt/openblas/include
/usr/include/x86_64-linux-gnu/openblas/include
)
endif()
endif()
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
#add_compile_options(${BLAS_LINKER_FLAGS})
target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
add_compile_definitions(GGML_BLAS_USE_MKL)
endif()
target_link_libraries (ggml-blas PRIVATE ${BLAS_LIBRARIES})
target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
else()
message(ERROR "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
" to set correct GGML_BLAS_VENDOR")
endif()

View File

@ -6,7 +6,7 @@
#include <vector> #include <vector>
#include <cstring> #include <cstring>
#if defined(GGML_USE_ACCELERATE) #if defined(GGML_BLAS_USE_ACCELERATE)
# include <Accelerate/Accelerate.h> # include <Accelerate/Accelerate.h>
#elif defined(GGML_BLAS_USE_MKL) #elif defined(GGML_BLAS_USE_MKL)
# include <mkl.h> # include <mkl.h>
@ -320,7 +320,7 @@ static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) {
} }
static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) { static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) {
#if defined(GGML_USE_ACCELERATE) #if defined(GGML_BLAS_USE_ACCELERATE)
return "Accelerate"; return "Accelerate";
#elif defined(GGML_BLAS_USE_MKL) #elif defined(GGML_BLAS_USE_MKL)
return "MKL"; return "MKL";

View File

@ -0,0 +1,46 @@
if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
endif()
if (CANN_INSTALL_DIR)
# Only Support Linux.
if (NOT UNIX)
message(FATAL_ERROR "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}")
endif()
# Supported platforms: x86-64, arm64
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
else()
message(FATAL_ERROR "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}")
endif()
# Set header and libs
set(CANN_INCLUDE_DIRS
${CANN_INSTALL_DIR}/include
${CANN_INSTALL_DIR}/include/aclnn
${CANN_INSTALL_DIR}/acllib/include
)
add_subdirectory(kernels)
list(APPEND CANN_LIBRARIES
ascendcl
nnopbase
opapi
acl_op_compiler
ascendc_kernels
)
file(GLOB GGML_SOURCES_CANN "*.cpp")
add_library(ggml-cann ${GGML_SOURCES_CANN})
target_link_libraries(ggml-cann PRIVATE ggml-base ${CANN_LIBRARIES})
target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS})
target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
else()
message(FATAL_ERROR "CANN: Can't find CANN_INSTALL_DIR, did you forget to source set_var.sh?")
endif()

View File

@ -0,0 +1,244 @@
add_library(ggml-cpu
ggml-cpu.c
ggml-cpu.cpp
ggml-cpu-aarch64.c
ggml-cpu-aarch64.h
ggml-cpu-quants.c
ggml-cpu-quants.h
)
target_link_libraries(ggml-cpu PRIVATE ggml-base)
target_include_directories(ggml-cpu PRIVATE . ..)
if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
if (ACCELERATE_FRAMEWORK)
message(STATUS "Accelerate framework found")
add_compile_definitions(GGML_USE_ACCELERATE)
add_compile_definitions(ACCELERATE_NEW_LAPACK)
add_compile_definitions(ACCELERATE_LAPACK_ILP64)
target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
else()
message(WARNING "Accelerate framework not found")
endif()
endif()
if (GGML_OPENMP)
find_package(OpenMP)
if (OpenMP_FOUND)
message(STATUS "OpenMP found")
add_compile_definitions(GGML_USE_OPENMP)
target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
# FIXME: should be replaced with a compiler id check
#if (GGML_MUSA)
# list(APPEND GGML_CPU_EXTRA_INCLUDES "/usr/lib/llvm-14/lib/clang/14.0.0/include")
# list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
#endif()
else()
message(WARNING "OpenMP not found")
endif()
endif()
if (GGML_LLAMAFILE)
message(STATUS "Using llamafile")
add_compile_definitions(GGML_USE_LLAMAFILE)
target_sources(ggml-cpu PRIVATE
llamafile/sgemm.cpp
llamafile/sgemm.h)
endif()
if (GGML_CPU_HBM)
find_library(memkind memkind REQUIRED)
message(STATUS "Using memkind for CPU HBM")
add_compile_definitions(GGML_USE_CPU_HBM)
target_link_libraries(ggml-cpu PUBLIC memkind)
endif()
if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
(NOT CMAKE_OSX_ARCHITECTURES AND
NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
message(STATUS "ARM detected")
if (MSVC)
add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
add_compile_definitions(__ARM_NEON)
add_compile_definitions(__ARM_FEATURE_FMA)
set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD)
add_compile_definitions(__ARM_FEATURE_DOTPROD)
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
endif ()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
else()
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
# Android armeabi-v7a
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
else()
# Raspberry Pi 2
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
endif()
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Android arm64-v8a
# Raspberry Pi 3, 4, Zero 2 (32-bit)
list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif()
if (GGML_SVE)
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
endif()
endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
message(STATUS "x86 detected")
if (MSVC)
# instruction set detection for MSVC only
if (GGML_NATIVE)
# TODO: improve, should not reference files from the parent folder
include(cmake/FindSIMD.cmake)
endif ()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS /arch:AVX512)
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the
# macros corresponding to the extensions.
# Do it manually.
if (GGML_AVX512_VBMI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
endif()
if (GGML_AVX512_VNNI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
if (GGML_AVX512_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
endif()
if (GGML_AMX_TILE)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
endif()
if (GGML_AMX_INT8)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
endif()
if (GGML_AMX_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
endif()
elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX)
endif()
else()
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
endif()
if (GGML_F16C)
list(APPEND ARCH_FLAGS -mf16c)
endif()
if (GGML_FMA)
list(APPEND ARCH_FLAGS -mfma)
endif()
if (GGML_AVX)
list(APPEND ARCH_FLAGS -mavx)
endif()
if (GGML_AVX2)
list(APPEND ARCH_FLAGS -mavx2)
endif()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512dq)
list(APPEND ARCH_FLAGS -mavx512bw)
endif()
if (GGML_AVX512_VBMI)
list(APPEND ARCH_FLAGS -mavx512vbmi)
endif()
if (GGML_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
if (GGML_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
if (GGML_AMX_TILE)
list(APPEND ARCH_FLAGS -mamx-tile)
endif()
if (GGML_AMX_INT8)
list(APPEND ARCH_FLAGS -mamx-int8)
endif()
if (GGML_AMX_BF16)
list(APPEND ARCH_FLAGS -mamx-bf16)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
OUTPUT_VARIABLE POWER10_M)
string(FIND ${POWER10_M} "POWER10" substring_index)
if(${substring_index} GREATER_EQUAL 0)
list(APPEND ARCH_FLAGS -mcpu=power10)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
else()
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
message(STATUS "loongarch64 detected")
list(APPEND ARCH_FLAGS -march=loongarch64)
if (GGML_LASX)
list(APPEND ARCH_FLAGS -mlasx)
endif()
if (GGML_LSX)
list(APPEND ARCH_FLAGS -mlsx)
endif()
else()
message(STATUS "Unknown architecture")
endif()
target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
if (EMSCRIPTEN)
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
endif()

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,27 @@
#pragma once
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
// GEMV
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// GEMM
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#ifdef __cplusplus
}
#endif

View File

@ -27,80 +27,6 @@ extern "C" {
#endif #endif
/**
* Converts brain16 to float32.
*
* The bfloat16 floating point format has the following structure:
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b0000000000000000 brain16
*
* Since bf16 has the same number of exponent bits as a 32bit float,
* encoding and decoding numbers becomes relatively straightforward.
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b00000000000000000000000000000000 IEEE binary32
*
* For comparison, the standard fp16 format has fewer exponent bits.
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b0000000000000000 IEEE binary16
*
* @see IEEE 754-2008
*/
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
union {
float f;
uint32_t i;
} u;
u.i = (uint32_t)h.bits << 16;
return u.f;
}
/**
* Converts float32 to brain16.
*
* This is binary identical with Google Brain float conversion.
* Floats shall round to nearest even, and NANs shall be quiet.
* Subnormals aren't flushed to zero, except perhaps when used.
* This code should vectorize nicely if using modern compilers.
*/
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
ggml_bf16_t h;
union {
float f;
uint32_t i;
} u;
u.f = s;
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
h.bits = (u.i >> 16) | 64; /* force to quiet */
return h;
}
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
return h;
}
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
#ifndef __FMA__ #ifndef __FMA__
@ -388,28 +314,6 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
#endif // defined(__ARM_NEON) #endif // defined(__ARM_NEON)
#if defined(__ARM_NEON) && !defined(_MSC_VER)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
ggml_fp16_internal_t tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
ggml_fp16_internal_t tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res;
}
#else
#ifdef __wasm_simd128__ #ifdef __wasm_simd128__
#include <wasm_simd128.h> #include <wasm_simd128.h>
#else #else
@ -462,153 +366,6 @@ static __m256 __lasx_xvreplfr2vr_s(float val) {
} }
#endif #endif
#ifdef __F16C__
#ifdef _MSC_VER
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif
#elif defined(__POWER9_VECTOR__)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
register float f;
register double d;
__asm__(
"mtfprd %0,%2\n"
"xscvhpdp %0,%0\n"
"frsp %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=f"(f):
/* in */ "r"(h));
return f;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
register double d;
register ggml_fp16_t r;
__asm__( /* xscvdphp can work on double or single precision */
"xscvdphp %0,%2\n"
"mffprd %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=r"(r):
/* in */ "f"(f));
return r;
}
#else
// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
static inline float fp32_from_bits(uint32_t w) {
union {
uint32_t as_bits;
float as_value;
} fp32;
fp32.as_bits = w;
return fp32.as_value;
}
static inline uint32_t fp32_to_bits(float f) {
union {
float as_value;
uint32_t as_bits;
} fp32;
fp32.as_value = f;
return fp32.as_bits;
}
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t two_w = w + w;
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float exp_scale = 0x1.0p-112f;
#else
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
const uint32_t magic_mask = UINT32_C(126) << 23;
const float magic_bias = 0.5f;
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
const uint32_t result = sign |
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
return fp32_from_bits(result);
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float scale_to_inf = 0x1.0p+112f;
const float scale_to_zero = 0x1.0p-110f;
#else
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
const uint32_t w = fp32_to_bits(f);
const uint32_t shl1_w = w + w;
const uint32_t sign = w & UINT32_C(0x80000000);
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
if (bias < UINT32_C(0x71000000)) {
bias = UINT32_C(0x71000000);
}
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
const uint32_t bits = fp32_to_bits(base);
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#endif // __F16C__
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE
// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return ggml_table_f32_f16[s];
}
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif
#if !defined(GGML_FP32_TO_FP16)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,63 @@
#pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h"
// GGML CPU internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
#ifdef __cplusplus
}
#endif

View File

@ -1,13 +1,15 @@
#define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows #define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
#define _USE_MATH_DEFINES // For M_PI on MSVC #define _USE_MATH_DEFINES // For M_PI on MSVC
#include "ggml-aarch64.h"
#include "ggml-backend-impl.h" #include "ggml-backend-impl.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#include "ggml-cpu-aarch64.h"
#include "ggml-cpu-impl.h" #include "ggml-cpu-impl.h"
#include "ggml-cpu.h" #include "ggml-cpu.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include "ggml-quants.h" #include "ggml-quants.h"
#include "ggml-cpu-quants.h"
#include "ggml-threading.h"
#include "ggml.h" #include "ggml.h"
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
@ -42,7 +44,7 @@
#endif #endif
#ifdef GGML_USE_LLAMAFILE #ifdef GGML_USE_LLAMAFILE
#include <llamafile/sgemm.h> #include "llamafile/sgemm.h"
#endif #endif
#if defined(_MSC_VER) #if defined(_MSC_VER)
@ -104,9 +106,6 @@ static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
// precomputed quick gelu table for f16 (128 KB) // precomputed quick gelu table for f16 (128 KB)
static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
float ggml_table_f32_f16[1 << 16];
#if defined(__ARM_ARCH) #if defined(__ARM_ARCH)
struct ggml_arm_arch_features_type { struct ggml_arm_arch_features_type {
int has_neon; int has_neon;
@ -261,11 +260,13 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_F16] = { [GGML_TYPE_F16] = {
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
.vec_dot_type = GGML_TYPE_F16, .vec_dot_type = GGML_TYPE_F16,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_Q4_0] = { [GGML_TYPE_Q4_0] = {
.from_float = quantize_row_q4_0,
.vec_dot = ggml_vec_dot_q4_0_q8_0, .vec_dot = ggml_vec_dot_q4_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0, .vec_dot_type = GGML_TYPE_Q8_0,
#if defined (__ARM_FEATURE_MATMUL_INT8) #if defined (__ARM_FEATURE_MATMUL_INT8)
@ -275,6 +276,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
#endif #endif
}, },
[GGML_TYPE_Q4_1] = { [GGML_TYPE_Q4_1] = {
.from_float = quantize_row_q4_1,
.vec_dot = ggml_vec_dot_q4_1_q8_1, .vec_dot = ggml_vec_dot_q4_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1, .vec_dot_type = GGML_TYPE_Q8_1,
#if defined (__ARM_FEATURE_MATMUL_INT8) #if defined (__ARM_FEATURE_MATMUL_INT8)
@ -283,27 +285,20 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.nrows = 1, .nrows = 1,
#endif #endif
}, },
[4] = { // GGML_TYPE_Q4_2
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_COUNT,
.nrows = 1,
},
[5] = { // GGML_TYPE_Q4_3
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_COUNT,
.nrows = 1,
},
[GGML_TYPE_Q5_0] = { [GGML_TYPE_Q5_0] = {
.from_float = quantize_row_q5_0,
.vec_dot = ggml_vec_dot_q5_0_q8_0, .vec_dot = ggml_vec_dot_q5_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0, .vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_Q5_1] = { [GGML_TYPE_Q5_1] = {
.from_float = quantize_row_q5_1,
.vec_dot = ggml_vec_dot_q5_1_q8_1, .vec_dot = ggml_vec_dot_q5_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1, .vec_dot_type = GGML_TYPE_Q8_1,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_Q8_0] = { [GGML_TYPE_Q8_0] = {
.from_float = quantize_row_q8_0,
.from_float_to_mat = quantize_mat_q8_0, .from_float_to_mat = quantize_mat_q8_0,
.vec_dot = ggml_vec_dot_q8_0_q8_0, .vec_dot = ggml_vec_dot_q8_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0, .vec_dot_type = GGML_TYPE_Q8_0,
@ -314,85 +309,106 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
#endif #endif
}, },
[GGML_TYPE_Q8_1] = { [GGML_TYPE_Q8_1] = {
.from_float = quantize_row_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1, .vec_dot_type = GGML_TYPE_Q8_1,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_Q2_K] = { [GGML_TYPE_Q2_K] = {
.from_float = quantize_row_q2_K,
.vec_dot = ggml_vec_dot_q2_K_q8_K, .vec_dot = ggml_vec_dot_q2_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_Q3_K] = { [GGML_TYPE_Q3_K] = {
.from_float = quantize_row_q3_K,
.vec_dot = ggml_vec_dot_q3_K_q8_K, .vec_dot = ggml_vec_dot_q3_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_Q4_K] = { [GGML_TYPE_Q4_K] = {
.from_float = quantize_row_q4_K,
.vec_dot = ggml_vec_dot_q4_K_q8_K, .vec_dot = ggml_vec_dot_q4_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_Q5_K] = { [GGML_TYPE_Q5_K] = {
.from_float = quantize_row_q5_K,
.vec_dot = ggml_vec_dot_q5_K_q8_K, .vec_dot = ggml_vec_dot_q5_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_Q6_K] = { [GGML_TYPE_Q6_K] = {
.from_float = quantize_row_q6_K,
.vec_dot = ggml_vec_dot_q6_K_q8_K, .vec_dot = ggml_vec_dot_q6_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_IQ2_XXS] = { [GGML_TYPE_IQ2_XXS] = {
.from_float = NULL,
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K, .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_IQ2_XS] = { [GGML_TYPE_IQ2_XS] = {
.from_float = NULL,
.vec_dot = ggml_vec_dot_iq2_xs_q8_K, .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_IQ3_XXS] = { [GGML_TYPE_IQ3_XXS] = {
// NOTE: from_float for iq3 and iq2_s was removed because these quants require initialization in ggml_quantize_init
//.from_float = quantize_row_iq3_xxs,
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K, .vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_IQ3_S] = { [GGML_TYPE_IQ3_S] = {
//.from_float = quantize_row_iq3_s,
.vec_dot = ggml_vec_dot_iq3_s_q8_K, .vec_dot = ggml_vec_dot_iq3_s_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_IQ2_S] = { [GGML_TYPE_IQ2_S] = {
//.from_float = quantize_row_iq2_s,
.vec_dot = ggml_vec_dot_iq2_s_q8_K, .vec_dot = ggml_vec_dot_iq2_s_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_IQ1_S] = { [GGML_TYPE_IQ1_S] = {
.from_float = NULL,
.vec_dot = ggml_vec_dot_iq1_s_q8_K, .vec_dot = ggml_vec_dot_iq1_s_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_IQ1_M] = { [GGML_TYPE_IQ1_M] = {
.from_float = NULL,
.vec_dot = ggml_vec_dot_iq1_m_q8_K, .vec_dot = ggml_vec_dot_iq1_m_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_IQ4_NL] = { [GGML_TYPE_IQ4_NL] = {
.from_float = quantize_row_iq4_nl,
.vec_dot = ggml_vec_dot_iq4_nl_q8_0, .vec_dot = ggml_vec_dot_iq4_nl_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0, .vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_IQ4_XS] = { [GGML_TYPE_IQ4_XS] = {
.from_float = quantize_row_iq4_xs,
.vec_dot = ggml_vec_dot_iq4_xs_q8_K, .vec_dot = ggml_vec_dot_iq4_xs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_Q8_K] = {
.from_float = quantize_row_q8_K,
},
[GGML_TYPE_BF16] = { [GGML_TYPE_BF16] = {
.from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
.vec_dot_type = GGML_TYPE_BF16, .vec_dot_type = GGML_TYPE_BF16,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_Q4_0_4_4] = { [GGML_TYPE_Q4_0_4_4] = {
.from_float = NULL,
.vec_dot = NULL, .vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0, .vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1, .nrows = 1,
@ -401,6 +417,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.gemm = ggml_gemm_q4_0_4x4_q8_0, .gemm = ggml_gemm_q4_0_4x4_q8_0,
}, },
[GGML_TYPE_Q4_0_4_8] = { [GGML_TYPE_Q4_0_4_8] = {
.from_float = NULL,
.vec_dot = NULL, .vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0, .vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1, .nrows = 1,
@ -409,6 +426,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.gemm = ggml_gemm_q4_0_4x8_q8_0, .gemm = ggml_gemm_q4_0_4x8_q8_0,
}, },
[GGML_TYPE_Q4_0_8_8] = { [GGML_TYPE_Q4_0_8_8] = {
.from_float = NULL,
.vec_dot = NULL, .vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0, .vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1, .nrows = 1,
@ -417,11 +435,13 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.gemm = ggml_gemm_q4_0_8x8_q8_0, .gemm = ggml_gemm_q4_0_8x8_q8_0,
}, },
[GGML_TYPE_TQ1_0] = { [GGML_TYPE_TQ1_0] = {
.from_float = quantize_row_tq1_0,
.vec_dot = ggml_vec_dot_tq1_0_q8_K, .vec_dot = ggml_vec_dot_tq1_0_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_TQ2_0] = { [GGML_TYPE_TQ2_0] = {
.from_float = quantize_row_tq2_0,
.vec_dot = ggml_vec_dot_tq2_0_q8_K, .vec_dot = ggml_vec_dot_tq2_0_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
@ -2250,22 +2270,7 @@ struct ggml_state {
struct ggml_numa_nodes numa; struct ggml_numa_nodes numa;
}; };
// global state
static struct ggml_state g_state = {0}; static struct ggml_state g_state = {0};
static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;
// TODO: move to threading file
// critical section via spin lock
void ggml_critical_section_start(void) {
while (atomic_flag_test_and_set(&g_state_critical)) {
// spin
sched_yield();
}
}
void ggml_critical_section_end(void) {
atomic_flag_clear(&g_state_critical);
}
static void ggml_barrier(struct ggml_threadpool * tp) { static void ggml_barrier(struct ggml_threadpool * tp) {
int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
@ -2997,8 +3002,8 @@ static void ggml_compute_forward_dup_f16(
id += ne00 * (ne01 - ir1); id += ne00 * (ne01 - ir1);
} }
} }
} else if (ggml_get_type_traits(dst->type)->from_float) { } else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float; ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
size_t id = 0; size_t id = 0;
@ -3278,8 +3283,8 @@ static void ggml_compute_forward_dup_bf16(
id += ne00 * (ne01 - ir1); id += ne00 * (ne01 - ir1);
} }
} }
} else if (ggml_get_type_traits(dst->type)->from_float) { } else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float; ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
size_t id = 0; size_t id = 0;
@ -3594,8 +3599,8 @@ static void ggml_compute_forward_dup_f32(
id += rs * (ne01 - ir1); id += rs * (ne01 - ir1);
} }
} }
} else if (ggml_get_type_traits(dst->type)->from_float) { } else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float; ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
size_t id = 0; size_t id = 0;
size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
@ -4377,7 +4382,7 @@ static void ggml_compute_forward_add_q_f32(
const enum ggml_type type = src0->type; const enum ggml_type type = src0->type;
const enum ggml_type dtype = dst->type; const enum ggml_type dtype = dst->type;
ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float; ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dtype)->from_float; ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dtype)->from_float;
// we don't support permuted src0 or src1 // we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type)); GGML_ASSERT(nb00 == ggml_type_size(type));
@ -4679,7 +4684,7 @@ static void ggml_compute_forward_add1_q_f32(
const enum ggml_type type = src0->type; const enum ggml_type type = src0->type;
ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float; ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
ggml_from_float_t const quantize_row_q = ggml_get_type_traits(type)->from_float; ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(type)->from_float;
// we don't support permuted src0 // we don't support permuted src0
GGML_ASSERT(nb00 == ggml_type_size(type)); GGML_ASSERT(nb00 == ggml_type_size(type));
@ -7428,7 +7433,7 @@ static void ggml_compute_forward_mul_mat(
const enum ggml_type type = src0->type; const enum ggml_type type = src0->type;
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
ggml_from_float_t const from_float = ggml_get_type_traits(vec_dot_type)->from_float; ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat; ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
int64_t const vec_dot_num_rows = type_traits_cpu[type].nrows; int64_t const vec_dot_num_rows = type_traits_cpu[type].nrows;
int64_t const matmul_num_cols = type_traits_cpu[type].ncols; int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
@ -7649,7 +7654,7 @@ static void ggml_compute_forward_mul_mat_id(
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
ggml_from_float_t const from_float = ggml_get_type_traits(vec_dot_type)->from_float; ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
int64_t const matmul_num_cols = type_traits_cpu[type].ncols; int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
ggml_gemv_t const gemv = type_traits_cpu[type].gemv; ggml_gemv_t const gemv = type_traits_cpu[type].gemv;
@ -9159,12 +9164,6 @@ static void rope_yarn(
*sin_theta = sinf(theta) * mscale; *sin_theta = sinf(theta) * mscale;
} }
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
}
static void ggml_rope_cache_init( static void ggml_rope_cache_init(
float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale, float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
float * cache, float sin_sign, float theta_scale) { float * cache, float sin_sign, float theta_scale) {
@ -9181,16 +9180,6 @@ static void ggml_rope_cache_init(
} }
} }
void ggml_rope_yarn_corr_dims(
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
) {
// start and end correction dims
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
dims[0] = MAX(0, start);
dims[1] = MIN(n_dims - 1, end);
}
static void ggml_compute_forward_rope_f32( static void ggml_compute_forward_rope_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
struct ggml_tensor * dst, struct ggml_tensor * dst,
@ -10668,7 +10657,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
enum ggml_type const k_vec_dot_type = type_traits_cpu[k->type].vec_dot_type; enum ggml_type const k_vec_dot_type = type_traits_cpu[k->type].vec_dot_type;
ggml_from_float_t const q_to_vec_dot = ggml_get_type_traits(k_vec_dot_type)->from_float; ggml_from_float_t const q_to_vec_dot = type_traits_cpu[k_vec_dot_type].from_float;
ggml_vec_dot_t const kq_vec_dot = type_traits_cpu[k->type].vec_dot; ggml_vec_dot_t const kq_vec_dot = type_traits_cpu[k->type].vec_dot;
ggml_to_float_t const v_to_float = ggml_get_type_traits(v->type)->to_float; ggml_to_float_t const v_to_float = ggml_get_type_traits(v->type)->to_float;
@ -13759,6 +13748,151 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
return ggml_graph_compute(cgraph, &cplan); return ggml_graph_compute(cgraph, &cplan);
} }
int ggml_cpu_has_avx(void) {
#if defined(__AVX__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_avx_vnni(void) {
#if defined(__AVXVNNI__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_avx2(void) {
#if defined(__AVX2__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_avx512(void) {
#if defined(__AVX512F__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_avx512_vbmi(void) {
#if defined(__AVX512VBMI__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_avx512_vnni(void) {
#if defined(__AVX512VNNI__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_avx512_bf16(void) {
#if defined(__AVX512BF16__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_amx_int8(void) {
#if defined(__AMX_INT8__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_fma(void) {
#if defined(__FMA__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_arm_fma(void) {
#if defined(__ARM_FEATURE_FMA)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_riscv_v(void) {
#if defined(__riscv_v_intrinsic)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_f16c(void) {
#if defined(__F16C__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_fp16_va(void) {
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_wasm_simd(void) {
#if defined(__wasm_simd128__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_llamafile(void) {
#if defined(GGML_USE_LLAMAFILE)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_sse3(void) {
#if defined(__SSE3__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_ssse3(void) {
#if defined(__SSSE3__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_vsx(void) {
#if defined(__POWER9_VECTOR__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_neon(void) { int ggml_cpu_has_neon(void) {
#if defined(__ARM_ARCH) #if defined(__ARM_ARCH)
return ggml_arm_arch_features.has_neon; return ggml_arm_arch_features.has_neon;

View File

@ -0,0 +1,575 @@
#include "ggml-backend.h"
#include "ggml-backend-impl.h"
#include "ggml-cpu.h"
#include "ggml-impl.h"
#include <cctype>
#include <string>
#include <vector>
#if defined(__APPLE__)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#endif
// ggml-backend interface
#ifdef GGML_USE_CPU_HBM
// buffer type HBM
#include <hbwmalloc.h>
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU_HBM";
GGML_UNUSED(buft);
}
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
hbw_free(buffer->context);
}
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * ptr;
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
if (result != 0) {
GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
return NULL;
}
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
return buffer;
}
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
/* .iface = */ {
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
},
/* .context = */ NULL,
};
return &ggml_backend_cpu_buffer_type_hbm;
}
#endif
static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
static ggml_backend_buffer_type_t bufts[] = {
#ifdef GGML_USE_CPU_HBM
ggml_backend_cpu_hbm_buffer_type(),
#endif
NULL
};
return bufts;
GGML_UNUSED(device);
}
// CPU backend - backend (stream)
struct ggml_backend_cpu_context {
int n_threads;
ggml_threadpool_t threadpool;
uint8_t * work_data;
size_t work_size;
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
return "CPU";
GGML_UNUSED(backend);
}
static void ggml_backend_cpu_free(ggml_backend_t backend) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
delete[] cpu_ctx->work_data;
delete cpu_ctx;
delete backend;
}
struct ggml_backend_plan_cpu {
struct ggml_cplan cplan;
struct ggml_cgraph cgraph;
};
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
if (cpu_plan->cplan.work_size > 0) {
cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
if (cpu_plan->cplan.work_data == NULL) {
delete cpu_plan;
return NULL;
}
}
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
return cpu_plan;
}
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
delete[] cpu_plan->cplan.work_data;
delete cpu_plan;
GGML_UNUSED(backend);
}
static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
GGML_UNUSED(backend);
}
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
if (cpu_ctx->work_size < cplan.work_size) {
delete[] cpu_ctx->work_data;
cpu_ctx->work_data = new uint8_t[cplan.work_size];
if (cpu_ctx->work_data == NULL) {
cpu_ctx->work_size = 0;
return GGML_STATUS_ALLOC_FAILED;
}
cpu_ctx->work_size = cplan.work_size;
}
cplan.work_data = (uint8_t *)cpu_ctx->work_data;
cplan.abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
return ggml_graph_compute(cgraph, &cplan);
}
static const struct ggml_backend_i ggml_backend_cpu_i = {
/* .get_name = */ ggml_backend_cpu_get_name,
/* .free = */ ggml_backend_cpu_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
static ggml_guid_t ggml_backend_cpu_guid(void) {
static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
return &guid;
}
ggml_backend_t ggml_backend_cpu_init(void) {
// initialize CPU backend now to avoid slowing the first graph computation
ggml_cpu_init();
struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
if (ctx == NULL) {
return NULL;
}
ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->threadpool = NULL;
ctx->work_data = NULL;
ctx->work_size = 0;
ctx->abort_callback = NULL;
ctx->abort_callback_data = NULL;
ggml_backend_t cpu_backend = new ggml_backend {
/* .guid = */ ggml_backend_cpu_guid(),
/* .interface = */ ggml_backend_cpu_i,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ ctx,
};
if (cpu_backend == NULL) {
delete ctx;
return NULL;
}
return cpu_backend;
}
bool ggml_backend_is_cpu(ggml_backend_t backend) {
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
}
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
ctx->n_threads = n_threads;
}
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
if (ctx->threadpool && ctx->threadpool != threadpool) {
// already had a different threadpool, pause/suspend it before switching
ggml_threadpool_pause(ctx->threadpool);
}
ctx->threadpool = threadpool;
}
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
ctx->abort_callback = abort_callback;
ctx->abort_callback_data = abort_callback_data;
}
// CPU backend - device
struct ggml_backend_cpu_device_context {
std::string description = "CPU";
ggml_backend_cpu_device_context() {
#ifdef __APPLE__
size_t len = 0;
if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
description.resize(len);
sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
}
#elif defined(__linux__)
FILE * f = fopen("/proc/cpuinfo", "r");
if (f) {
char buf[1024];
while (fgets(buf, sizeof(buf), f)) {
if (strncmp(buf, "model name", 10) == 0) {
char * p = strchr(buf, ':');
if (p) {
p++;
while (std::isspace(*p)) {
p++;
}
while (std::isspace(p[strlen(p) - 1])) {
p[strlen(p) - 1] = '\0';
}
description = p;
break;
}
}
}
fclose(f);
}
#elif defined(_WIN32)
HKEY hKey;
if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
0,
KEY_READ,
&hKey) == ERROR_SUCCESS) {
DWORD cpu_brand_size = 0;
if (RegQueryValueExA(hKey,
TEXT("ProcessorNameString"),
NULL,
NULL,
NULL,
&cpu_brand_size) == ERROR_SUCCESS) {
description.resize(cpu_brand_size);
if (RegQueryValueExA(hKey,
TEXT("ProcessorNameString"),
NULL,
NULL,
(LPBYTE)&description[0], // NOLINT
&cpu_brand_size) == ERROR_SUCCESS) {
if (description.find('\0') != std::string::npos) {
description.resize(description.find('\0'));
}
}
}
RegCloseKey(hKey);
}
#endif
}
};
static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
return "CPU";
GGML_UNUSED(dev);
}
static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
return ctx->description.c_str();
}
static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
// TODO
*free = 0;
*total = 0;
GGML_UNUSED(dev);
}
static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_CPU;
GGML_UNUSED(dev);
}
static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_cpu_device_get_name(dev);
props->description = ggml_backend_cpu_device_get_description(dev);
props->type = ggml_backend_cpu_device_get_type(dev);
ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* .async = */ false,
/* .host_buffer = */ false,
/* .buffer_from_host_ptr = */ true,
/* .events = */ false,
};
}
static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
return ggml_backend_cpu_init();
GGML_UNUSED(dev);
GGML_UNUSED(params);
}
static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
return ggml_backend_cpu_buffer_type();
GGML_UNUSED(dev);
}
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
GGML_UNUSED(dev);
GGML_UNUSED(max_tensor_size);
}
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
switch (op->op) {
case GGML_OP_CPY:
return
op->type != GGML_TYPE_IQ2_XXS &&
op->type != GGML_TYPE_IQ2_XS &&
op->type != GGML_TYPE_IQ1_S &&
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
return op->src[1]->type == GGML_TYPE_F32;// FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
case GGML_OP_ROPE_BACK:
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
case GGML_OP_IM2COL_BACK:
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
case GGML_OP_OUT_PROD:
return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
default:
return true;
}
GGML_UNUSED(dev);
}
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(buft);
GGML_UNUSED(dev);
}
static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
/* .get_name = */ ggml_backend_cpu_device_get_name,
/* .get_description = */ ggml_backend_cpu_device_get_description,
/* .get_memory = */ ggml_backend_cpu_device_get_memory,
/* .get_type = */ ggml_backend_cpu_device_get_type,
/* .get_props = */ ggml_backend_cpu_device_get_props,
/* .init_backend = */ ggml_backend_cpu_device_init_backend,
/* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
/* .supports_op = */ ggml_backend_cpu_device_supports_op,
/* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
// CPU backend - backend (reg)
static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
return "CPU";
GGML_UNUSED(reg);
}
static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
return 1;
GGML_UNUSED(reg);
}
static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
GGML_ASSERT(index == 0);
static ggml_backend_cpu_device_context ctx;
static ggml_backend_device ggml_backend_cpu_device = {
/* .iface = */ ggml_backend_cpu_device_i,
/* .reg = */ reg,
/* .context = */ &ctx,
};
return &ggml_backend_cpu_device;
}
struct ggml_backend_feature {
const char * name;
const char * value;
};
// Not used yet
// This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically,
// and additionally to allow other backends to expose their own list of features that applications can query using the same API.
static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
static std::vector<ggml_backend_feature> features = []() {
std::vector<ggml_backend_feature> features;
if (ggml_cpu_has_sse3()) {
features.push_back({ "SSE3", "1" });
}
if (ggml_cpu_has_ssse3()) {
features.push_back({ "SSSE3", "1" });
}
if (ggml_cpu_has_avx()) {
features.push_back({ "AVX", "1" });
}
if (ggml_cpu_has_avx2()) {
features.push_back({ "AVX2", "1" });
}
if (ggml_cpu_has_f16c()) {
features.push_back({ "F16C", "1" });
}
if (ggml_cpu_has_fma()) {
features.push_back({ "FMA", "1" });
}
if (ggml_cpu_has_avx_vnni()) {
features.push_back({ "AVX_VNNI", "1" });
}
if (ggml_cpu_has_avx512()) {
features.push_back({ "AVX512", "1" });
}
if (ggml_cpu_has_avx512_vbmi()) {
features.push_back({ "AVX512_VBMI", "1" });
}
if (ggml_cpu_has_avx512_vnni()) {
features.push_back({ "AVX512_VNNI", "1" });
}
if (ggml_cpu_has_avx512_bf16()) {
features.push_back({ "AVX512_BF16", "1" });
}
if (ggml_cpu_has_amx_int8()) {
features.push_back({ "AMX_INT8", "1" });
}
if (ggml_cpu_has_neon()) {
features.push_back({ "NEON", "1" });
}
if (ggml_cpu_has_arm_fma()) {
features.push_back({ "ARM_FMA", "1" });
}
if (ggml_cpu_has_fp16_va()) {
features.push_back({ "FP16_VA", "1" });
}
if (ggml_cpu_has_matmul_int8()) {
features.push_back({ "MATMUL_INT8", "1" });
}
if (ggml_cpu_has_sve()) {
features.push_back({ "SVE", "1" });
}
if (ggml_cpu_get_sve_cnt() > 0) {
static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
features.push_back({ "SVE_CNT", sve_cnt.c_str() });
}
if (ggml_cpu_has_riscv_v()) {
features.push_back({ "RISCV_V", "1" });
}
if (ggml_cpu_has_vsx()) {
features.push_back({ "VSX", "1" });
}
if (ggml_cpu_has_wasm_simd()) {
features.push_back({ "WASM_SIMD", "1" });
}
if (ggml_cpu_has_llamafile()) {
features.push_back({ "LLAMAFILE", "1" });
}
features.push_back({ nullptr, nullptr });
return features;
}();
return features.data();
GGML_UNUSED(reg);
}
static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
return (void *)ggml_backend_cpu_set_n_threads;
}
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
return (void *)ggml_backend_cpu_get_extra_bufts;
}
return NULL;
GGML_UNUSED(reg);
}
static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
/* .get_name = */ ggml_backend_cpu_reg_get_name,
/* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
/* .get_device = */ ggml_backend_cpu_reg_get_device,
/* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
};
ggml_backend_reg_t ggml_backend_cpu_reg(void) {
static struct ggml_backend_reg ggml_backend_cpu_reg = {
/* .iface = */ ggml_backend_cpu_reg_i,
/* .context = */ NULL,
};
return &ggml_backend_cpu_reg;
}

View File

@ -0,0 +1,165 @@
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "CUDA Toolkit found")
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == FP16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
enable_language(CUDA)
file(GLOB GGML_HEADERS_CUDA "*.cuh")
list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
file(GLOB GGML_SOURCES_CUDA "*.cu")
file(GLOB SRCS "template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
if (GGML_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "template-instances/fattn-vec*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
else()
file(GLOB SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
endif()
add_library(ggml-cuda
${GGML_HEADERS_CUDA}
${GGML_SOURCES_CUDA}
)
target_link_libraries(ggml-cuda PRIVATE ggml-base)
target_include_directories(ggml-cuda PRIVATE . ..)
# TODO: change the definitions to this target only
add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
if (GGML_CUDA_GRAPHS)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
endif()
if (GGML_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
if (GGML_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
if (GGML_CUDA_FORCE_CUBLAS)
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
endif()
if (GGML_CUDA_NO_VMM)
add_compile_definitions(GGML_CUDA_NO_VMM)
endif()
if (DEFINED GGML_CUDA_DMMV_Y)
add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_DMMV_Y}) # for backwards compatibility
endif()
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
add_compile_definitions(GGML_CUDA_F16)
endif()
if (GGML_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
if (GGML_STATIC)
if (WIN32)
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
else ()
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
endif()
else()
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
if (GGML_CUDA_NO_VMM)
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
else()
target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
endif()
set(CUDA_CXX_FLAGS "")
set(CUDA_FLAGS -use_fast_math)
if (GGML_FATAL_WARNINGS)
list(APPEND CUDA_FLAGS -Werror all-warnings)
endif()
if (GGML_ALL_WARNINGS AND NOT MSVC)
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
endif()
execute_process(
COMMAND ${NVCC_CMD} -Xcompiler --version
OUTPUT_VARIABLE CUDA_CCFULLVER
ERROR_QUIET
)
if (NOT CUDA_CCFULLVER MATCHES clang)
set(CUDA_CCID "GNU")
execute_process(
COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
OUTPUT_VARIABLE CUDA_CCVER
ERROR_QUIET
)
else()
if (CUDA_CCFULLVER MATCHES Apple)
set(CUDA_CCID "AppleClang")
else()
set(CUDA_CCID "Clang")
endif()
string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
endif()
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
get_flags(${CUDA_CCID} ${CUDA_CCVER})
list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
endif()
if (NOT MSVC)
list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
endif()
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
else()
message(FATAL_ERROR "CUDA Toolkit not found")
endif()

View File

@ -6,7 +6,7 @@
#include <cstdint> #include <cstdint>
#include <memory> #include <memory>
#if defined(GGML_USE_HIPBLAS) #if defined(GGML_USE_HIP)
#define GGML_COMMON_DECL_HIP #define GGML_COMMON_DECL_HIP
#define GGML_COMMON_IMPL_HIP #define GGML_COMMON_IMPL_HIP
#else #else
@ -26,13 +26,13 @@
#include <string> #include <string>
#include <vector> #include <vector>
#if defined(GGML_USE_HIPBLAS) #if defined(GGML_USE_HIP)
#include "vendors/hip.h" #include "vendors/hip.h"
#elif defined(GGML_USE_MUSA) #elif defined(GGML_USE_MUSA)
#include "vendors/musa.h" #include "vendors/musa.h"
#else #else
#include "vendors/cuda.h" #include "vendors/cuda.h"
#endif // defined(GGML_USE_HIPBLAS) #endif // defined(GGML_USE_HIP)
#define STRINGIZE_IMPL(...) #__VA_ARGS__ #define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__) #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
@ -97,7 +97,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str) #define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
#if !defined(GGML_USE_HIPBLAS) #if !defined(GGML_USE_HIP)
static const char * cu_get_error_str(CUresult err) { static const char * cu_get_error_str(CUresult err) {
const char * err_str; const char * err_str;
cuGetErrorString(err, &err_str); cuGetErrorString(err, &err_str);
@ -120,21 +120,21 @@ typedef float dfloat; // dequantize float
typedef float2 dfloat2; typedef float2 dfloat2;
#endif // GGML_CUDA_F16 #endif // GGML_CUDA_F16
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#define FP16_AVAILABLE #define FP16_AVAILABLE
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE #define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#define FP16_MMA_AVAILABLE #define FP16_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#define INT8_MMA_AVAILABLE #define INT8_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1) #if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
#define FLASH_ATTN_AVAILABLE #define FLASH_ATTN_AVAILABLE
@ -156,14 +156,14 @@ static constexpr bool int8_mma_available(const int cc) {
static __device__ void no_device_code( static __device__ void no_device_code(
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) { const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n", printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
file_name, line, function_name, arch); file_name, line, function_name, arch);
GGML_UNUSED(arch_list); GGML_UNUSED(arch_list);
#else #else
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n", printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
file_name, line, function_name, arch, arch_list); file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
__trap(); __trap();
GGML_UNUSED(no_device_code); // suppress unused function warning GGML_UNUSED(no_device_code); // suppress unused function warning
@ -176,7 +176,7 @@ static __device__ void no_device_code(
#endif // __CUDA_ARCH__ #endif // __CUDA_ARCH__
static __device__ __forceinline__ int warp_reduce_sum(int x) { static __device__ __forceinline__ int warp_reduce_sum(int x) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
return __reduce_add_sync(0xffffffff, x); return __reduce_add_sync(0xffffffff, x);
#else #else
#pragma unroll #pragma unroll
@ -184,7 +184,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
x += __shfl_xor_sync(0xffffffff, x, mask, 32); x += __shfl_xor_sync(0xffffffff, x, mask, 32);
} }
return x; return x;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
} }
static __device__ __forceinline__ float warp_reduce_sum(float x) { static __device__ __forceinline__ float warp_reduce_sum(float x) {
@ -207,7 +207,7 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#ifdef FP16_AVAILABLE #ifdef FP16_AVAILABLE
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#pragma unroll #pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) { for (int mask = 16; mask > 0; mask >>= 1) {
const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32); const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
@ -221,7 +221,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32)); a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
} }
return a; return a;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#else #else
NO_DEVICE_CODE; NO_DEVICE_CODE;
@ -240,11 +240,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) { static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#ifdef FP16_AVAILABLE #ifdef FP16_AVAILABLE
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
return __float2half(fmaxf(__half2float(a), __half2float(b))); return __float2half(fmaxf(__half2float(a), __half2float(b)));
#else #else
return __hmax(a, b); return __hmax(a, b);
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
#else #else
NO_DEVICE_CODE; NO_DEVICE_CODE;
@ -254,7 +254,7 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
} }
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) { static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#if CUDART_VERSION >= CUDART_HMAX #if CUDART_VERSION >= CUDART_HMAX
return __hmax2(a, b); return __hmax2(a, b);
@ -269,11 +269,11 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
GGML_UNUSED(a); GGML_UNUSED(a);
GGML_UNUSED(b); GGML_UNUSED(b);
NO_DEVICE_CODE; NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
} }
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll #pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) { for (int mask = 16; mask > 0; mask >>= 1) {
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
@ -282,7 +282,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#else #else
GGML_UNUSED(x); GGML_UNUSED(x);
NO_DEVICE_CODE; NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
} }
#if CUDART_VERSION < CUDART_HMASK #if CUDART_VERSION < CUDART_HMASK
@ -294,7 +294,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
#endif // CUDART_VERSION < CUDART_HMASK #endif // CUDART_VERSION < CUDART_HMASK
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) { static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2) #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
c = __builtin_amdgcn_sdot4(a, b, c, false); c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3) #elif defined(RDNA3)
@ -320,7 +320,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
#endif #endif
return c; return c;
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if __CUDA_ARCH__ >= MIN_CC_DP4A #if __CUDA_ARCH__ >= MIN_CC_DP4A
return __dp4a(a, b, c); return __dp4a(a, b, c);
@ -330,7 +330,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3]; return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
} }
// TODO: move to ggml-common.h // TODO: move to ggml-common.h

View File

@ -517,9 +517,9 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
} }
template<int D, int parallel_blocks> // D == head size template<int D, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1) __launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_combine_results( static __global__ void flash_attn_combine_results(
const float * __restrict__ VKQ_parts, const float * __restrict__ VKQ_parts,
const float2 * __restrict__ VKQ_meta, const float2 * __restrict__ VKQ_meta,

View File

@ -5,9 +5,9 @@
#define FATTN_KQ_STRIDE_TILE_F16 64 #define FATTN_KQ_STRIDE_TILE_F16 64
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1) __launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_tile_ext_f16( static __global__ void flash_attn_tile_ext_f16(
const char * __restrict__ Q, const char * __restrict__ Q,
const char * __restrict__ K, const char * __restrict__ K,

View File

@ -5,9 +5,9 @@
#define FATTN_KQ_STRIDE_TILE_F32 32 #define FATTN_KQ_STRIDE_TILE_F32 32
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1) __launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_tile_ext_f32( static __global__ void flash_attn_tile_ext_f32(
const char * __restrict__ Q, const char * __restrict__ Q,
const char * __restrict__ K, const char * __restrict__ K,

View File

@ -2,9 +2,9 @@
#include "fattn-common.cuh" #include "fattn-common.cuh"
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1) __launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f16( static __global__ void flash_attn_vec_ext_f16(
const char * __restrict__ Q, const char * __restrict__ Q,
const char * __restrict__ K, const char * __restrict__ K,

View File

@ -2,9 +2,9 @@
#include "fattn-common.cuh" #include "fattn-common.cuh"
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1) __launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f32( static __global__ void flash_attn_vec_ext_f32(
const char * __restrict__ Q, const char * __restrict__ Q,
const char * __restrict__ K, const char * __restrict__ K,

View File

@ -7,9 +7,9 @@
// D == head size, VKQ_stride == num VKQ rows calculated in parallel: // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap> template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1) __launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_ext_f16( static __global__ void flash_attn_ext_f16(
const char * __restrict__ Q, const char * __restrict__ Q,
const char * __restrict__ K, const char * __restrict__ K,

View File

@ -91,7 +91,7 @@ int ggml_cuda_get_device() {
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) { static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device); ggml_cuda_set_device(device);
#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA) #if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
auto res = hipMallocManaged(ptr, size); auto res = hipMallocManaged(ptr, size);
if (res == hipSuccess) { if (res == hipSuccess) {
// if error we "need" to know why... // if error we "need" to know why...
@ -100,7 +100,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
return res; return res;
#else #else
#if !defined(GGML_USE_HIPBLAS) #if !defined(GGML_USE_HIP)
cudaError_t err; cudaError_t err;
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
{ {
@ -113,7 +113,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
return err; return err;
#else #else
return cudaMalloc(ptr, size); return cudaMalloc(ptr, size);
#endif // !defined(GGML_USE_HIPBLAS) #endif // !defined(GGML_USE_HIP)
#endif #endif
} }
@ -151,7 +151,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) { for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0; int device_vmm = 0;
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) #if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
CUdevice device; CUdevice device;
CU_CHECK(cuDeviceGet(&device, id)); CU_CHECK(cuDeviceGet(&device, id));
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device)); CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@ -163,7 +163,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
alloc_prop.location.id = id; alloc_prop.location.id = id;
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
} }
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) #endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
info.devices[id].vmm = !!device_vmm; info.devices[id].vmm = !!device_vmm;
cudaDeviceProp prop; cudaDeviceProp prop;
@ -175,13 +175,13 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].nsm = prop.multiProcessorCount; info.devices[id].nsm = prop.multiProcessorCount;
info.devices[id].smpb = prop.sharedMemPerBlock; info.devices[id].smpb = prop.sharedMemPerBlock;
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
info.devices[id].smpbo = prop.sharedMemPerBlock; info.devices[id].smpbo = prop.sharedMemPerBlock;
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
#else #else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor; info.devices[id].cc = 100*prop.major + 10*prop.minor;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
} }
for (int id = 0; id < info.device_count; ++id) { for (int id = 0; id < info.device_count; ++id) {
@ -299,7 +299,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
}; };
// pool with virtual memory // pool with virtual memory
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) #if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
struct ggml_cuda_pool_vmm : public ggml_cuda_pool { struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
@ -393,14 +393,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used)); GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
} }
}; };
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) #endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) { std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) #if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
if (ggml_cuda_info().devices[device].vmm) { if (ggml_cuda_info().devices[device].vmm) {
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device)); return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
} }
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) #endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device)); return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
} }
@ -1325,7 +1325,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
static cudaError_t ggml_cuda_Memcpy2DPeerAsync( static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) { void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
cudaMemcpy3DPeerParms p = {}; cudaMemcpy3DPeerParms p = {};
p.dstDevice = dstDevice; p.dstDevice = dstDevice;
@ -1339,7 +1339,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
GGML_UNUSED(dstDevice); GGML_UNUSED(dstDevice);
GGML_UNUSED(srcDevice); GGML_UNUSED(srcDevice);
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream); return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
} }
static void ggml_cuda_op_mul_mat( static void ggml_cuda_op_mul_mat(

View File

@ -100,9 +100,9 @@ static constexpr __device__ int get_mmq_x_max_device() {
return 128; return 128;
#else // INT8_MMA_AVAILABLE #else // INT8_MMA_AVAILABLE
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
return 128; return 128;
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if __CUDA_ARCH__ >= CC_VOLTA #if __CUDA_ARCH__ >= CC_VOLTA
#ifdef GGML_CUDA_FORCE_MMQ #ifdef GGML_CUDA_FORCE_MMQ
@ -115,7 +115,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
return 64; return 64;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#endif // INT8_MMA_AVAILABLE #endif // INT8_MMA_AVAILABLE
} }
@ -124,7 +124,7 @@ static constexpr int get_mmq_y_host(const int cc) {
} }
static constexpr __device__ int get_mmq_y_device() { static constexpr __device__ int get_mmq_y_device() {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA1) #if defined(RDNA1)
return 64; return 64;
#else #else
@ -136,7 +136,7 @@ static constexpr __device__ int get_mmq_y_device() {
#else #else
return 64; return 64;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
} }
#define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0} #define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0}
@ -2569,7 +2569,7 @@ static __device__ void mul_mat_q_process_tile(
// The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598 // The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
template <ggml_type type, int mmq_x, int nwarps, bool need_check> template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2) #if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*nwarps, 2) __launch_bounds__(WARP_SIZE*nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2) #endif // defined(RDNA3) || defined(RDNA2)
@ -2579,7 +2579,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#else #else
__launch_bounds__(WARP_SIZE*nwarps, 2) __launch_bounds__(WARP_SIZE*nwarps, 2)
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
static __global__ void mul_mat_q( static __global__ void mul_mat_q(
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) { const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
@ -2594,7 +2594,7 @@ static __global__ void mul_mat_q(
constexpr int mmq_y = get_mmq_y_device(); constexpr int mmq_y = get_mmq_y_device();
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead: // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
{ {
constexpr bool fixup = false; constexpr bool fixup = false;
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup> mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
@ -2602,7 +2602,7 @@ static __global__ void mul_mat_q(
blockIdx.x, blockIdx.y, 0, ne00/qk); blockIdx.x, blockIdx.y, 0, ne00/qk);
return; return;
} }
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
const int64_t blocks_per_ne00 = ne00 / qk; const int64_t blocks_per_ne00 = ne00 / qk;
constexpr int blocks_per_iter = MMQ_ITER_K / qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk;
@ -2765,14 +2765,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const int shmem = mmq_get_shmem<type>(mmq_x, mmq_y, cc); const int shmem = mmq_get_shmem<type>(mmq_x, mmq_y, cc);
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
if (!shmem_limit_raised[id]) { if (!shmem_limit_raised[id]) {
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
shmem_limit_raised[id] = true; shmem_limit_raised[id] = true;
} }
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
const int nty = (args.ne01 + mmq_y - 1) / mmq_y; const int nty = (args.ne01 + mmq_y - 1) / mmq_y;
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x; const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;

View File

@ -48,10 +48,10 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
} }
template <ggml_type type, int ncols_y> template <ggml_type type, int ncols_y>
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
// tell the compiler to use as many registers as it wants, see nwarps definition below // tell the compiler to use as many registers as it wants, see nwarps definition below
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1) __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void mul_mat_vec_q( static __global__ void mul_mat_vec_q(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
@ -62,13 +62,13 @@ static __global__ void mul_mat_vec_q(
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type); constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3)) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
constexpr int nwarps = 1; constexpr int nwarps = 1;
constexpr int rows_per_cuda_block = 1; constexpr int rows_per_cuda_block = 1;
#else #else
constexpr int nwarps = ncols_y <= 4 ? 4 : 2; constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2; constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
const int row0 = rows_per_cuda_block*blockIdx.x; const int row0 = rows_per_cuda_block*blockIdx.x;

View File

@ -1,6 +1,6 @@
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700 #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
#define USE_CUB #define USE_CUB
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700 #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
#ifdef USE_CUB #ifdef USE_CUB
// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh. // On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.

View File

@ -0,0 +1,113 @@
if (NOT EXISTS $ENV{ROCM_PATH})
if (NOT EXISTS /opt/rocm)
set(ROCM_PATH /usr)
else()
set(ROCM_PATH /opt/rocm)
endif()
else()
set(ROCM_PATH $ENV{ROCM_PATH})
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
# CMake on Windows doesn't support the HIP language yet
if (WIN32)
set(CXX_IS_HIPCC TRUE)
else()
string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
endif()
if (CXX_IS_HIPCC)
if (LINUX)
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
endif()
message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
" Prefer setting the HIP compiler directly. See README for details.")
endif()
else()
# Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
endif()
cmake_minimum_required(VERSION 3.21)
enable_language(HIP)
endif()
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
message(STATUS "HIP and hipBLAS found")
file(GLOB GGML_HEADERS_ROCM "../ggml-cuda/*.cuh")
list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h")
file(GLOB GGML_SOURCES_ROCM "../ggml-cuda/*.cu")
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
if (GGML_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
else()
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
endif()
add_library(ggml-hip
${GGML_HEADERS_ROCM}
${GGML_SOURCES_ROCM})
target_link_libraries(ggml-hip PRIVATE ggml-base)
target_include_directories(ggml-hip PRIVATE . ..)
# TODO: do not use CUDA definitions for HIP
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
add_compile_definitions(GGML_USE_HIP)
add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
if (GGML_HIP_UMA)
add_compile_definitions(GGML_HIP_UMA)
endif()
if (GGML_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
if (GGML_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
if (GGML_CUDA_FORCE_CUBLAS)
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
endif()
if (GGML_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
if (CXX_IS_HIPCC)
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-hip PRIVATE hip::device)
else()
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
endif()
if (GGML_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
target_link_libraries(ggml-hip PRIVATE ggml-base hip::host roc::rocblas roc::hipblas)

View File

@ -3,13 +3,29 @@
// GGML internal header // GGML internal header
#include "ggml.h" #include "ggml.h"
#include <assert.h> #include <assert.h>
#include <math.h>
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
#include <stdbool.h> #include <stdbool.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE
#if defined(__ARM_NEON)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
#endif
#if defined(__F16C__)
#include <immintrin.h>
#endif
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
@ -28,13 +44,13 @@ extern "C" {
// if C99 - static_assert is noop // if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976 // ref: https://stackoverflow.com/a/53923785/4039976
#ifndef __cplusplus #ifndef __cplusplus
#ifndef static_assert #ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg) #define static_assert(cond, msg) _Static_assert(cond, msg)
#else #else
#define static_assert(cond, msg) struct global_scope_noop_trick #define static_assert(cond, msg) struct global_scope_noop_trick
#endif #endif
#endif #endif
#endif #endif
static inline int ggml_up32(int n) { static inline int ggml_up32(int n) {
@ -120,14 +136,12 @@ struct ggml_map_custom1_op_params {
void * userdata; void * userdata;
}; };
struct ggml_map_custom2_op_params { struct ggml_map_custom2_op_params {
ggml_custom2_op_t fun; ggml_custom2_op_t fun;
int n_tasks; int n_tasks;
void * userdata; void * userdata;
}; };
struct ggml_map_custom3_op_params { struct ggml_map_custom3_op_params {
ggml_custom3_op_t fun; ggml_custom3_op_t fun;
int n_tasks; int n_tasks;
@ -287,9 +301,249 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
void * ggml_aligned_malloc(size_t size); void * ggml_aligned_malloc(size_t size);
void ggml_aligned_free(void * ptr, size_t size); void ggml_aligned_free(void * ptr, size_t size);
// TODO: move to threading file // FP16 to FP32 conversion
void ggml_critical_section_start(void);
void ggml_critical_section_end(void); #if defined(__ARM_NEON)
#ifdef _MSC_VER
typedef uint16_t ggml_fp16_internal_t;
#else
typedef __fp16 ggml_fp16_internal_t;
#endif
#endif
#if defined(__ARM_NEON) && !defined(_MSC_VER)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
ggml_fp16_internal_t tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
ggml_fp16_internal_t tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res;
}
#elif defined(__F16C__)
#ifdef _MSC_VER
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif
#elif defined(__POWER9_VECTOR__)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
register float f;
register double d;
__asm__(
"mtfprd %0,%2\n"
"xscvhpdp %0,%0\n"
"frsp %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=f"(f):
/* in */ "r"(h));
return f;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
register double d;
register ggml_fp16_t r;
__asm__( /* xscvdphp can work on double or single precision */
"xscvdphp %0,%2\n"
"mffprd %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=r"(r):
/* in */ "f"(f));
return r;
}
#else
// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
static inline float fp32_from_bits(uint32_t w) {
union {
uint32_t as_bits;
float as_value;
} fp32;
fp32.as_bits = w;
return fp32.as_value;
}
static inline uint32_t fp32_to_bits(float f) {
union {
float as_value;
uint32_t as_bits;
} fp32;
fp32.as_value = f;
return fp32.as_bits;
}
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t two_w = w + w;
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
const float exp_scale = 0x1.0p-112f;
#else
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
const uint32_t magic_mask = UINT32_C(126) << 23;
const float magic_bias = 0.5f;
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
const uint32_t result = sign |
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
return fp32_from_bits(result);
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
const float scale_to_inf = 0x1.0p+112f;
const float scale_to_zero = 0x1.0p-110f;
#else
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
const uint32_t w = fp32_to_bits(f);
const uint32_t shl1_w = w + w;
const uint32_t sign = w & UINT32_C(0x80000000);
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
if (bias < UINT32_C(0x71000000)) {
bias = UINT32_C(0x71000000);
}
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
const uint32_t bits = fp32_to_bits(base);
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
GGML_API float ggml_table_f32_f16[1 << 16];
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return ggml_table_f32_f16[s];
}
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif
#if !defined(GGML_FP32_TO_FP16)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
/**
* Converts brain16 to float32.
*
* The bfloat16 floating point format has the following structure:
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b0000000000000000 brain16
*
* Since bf16 has the same number of exponent bits as a 32bit float,
* encoding and decoding numbers becomes relatively straightforward.
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b00000000000000000000000000000000 IEEE binary32
*
* For comparison, the standard fp16 format has fewer exponent bits.
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b0000000000000000 IEEE binary16
*
* @see IEEE 754-2008
*/
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
union {
float f;
uint32_t i;
} u;
u.i = (uint32_t)h.bits << 16;
return u.f;
}
/**
* Converts float32 to brain16.
*
* This is binary identical with Google Brain float conversion.
* Floats shall round to nearest even, and NANs shall be quiet.
* Subnormals aren't flushed to zero, except perhaps when used.
* This code should vectorize nicely if using modern compilers.
*/
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
ggml_bf16_t h;
union {
float f;
uint32_t i;
} u;
u.f = s;
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
h.bits = (u.i >> 16) | 64; /* force to quiet */
return h;
}
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
return h;
}
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -0,0 +1,162 @@
find_package(Vulkan COMPONENTS glslc REQUIRED)
find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
if (NOT glslc_executable)
message(FATAL_ERROR "glslc not found")
endif()
add_library(ggml-kompute
ggml-kompute.cpp
../../include/ggml-kompute.h
)
target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
target_include_directories(ggml-kompute PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR})
add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
function(compile_shader)
set(options)
set(oneValueArgs)
set(multiValueArgs SOURCES)
cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
foreach(source ${compile_shader_SOURCES})
get_filename_component(filename ${source} NAME)
set(spv_file ${filename}.spv)
add_custom_command(
OUTPUT ${spv_file}
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
COMMENT "Compiling ${source} to ${spv_file}"
)
get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
set(FILE_NAME "shader${RAW_FILE_NAME}")
string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
if(CMAKE_GENERATOR MATCHES "Visual Studio")
add_custom_command(
OUTPUT ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
DEPENDS ${spv_file} xxd
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
)
else()
add_custom_command(
OUTPUT ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
DEPENDS ${spv_file} xxd
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
)
endif()
endforeach()
endfunction()
if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
message(STATUS "Kompute found")
set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
add_subdirectory(kompute)
# Compile our shaders
compile_shader(SOURCES
kompute-shaders/op_scale.comp
kompute-shaders/op_scale_8.comp
kompute-shaders/op_add.comp
kompute-shaders/op_addrow.comp
kompute-shaders/op_mul.comp
kompute-shaders/op_silu.comp
kompute-shaders/op_relu.comp
kompute-shaders/op_gelu.comp
kompute-shaders/op_softmax.comp
kompute-shaders/op_norm.comp
kompute-shaders/op_rmsnorm.comp
kompute-shaders/op_diagmask.comp
kompute-shaders/op_mul_mat_mat_f32.comp
kompute-shaders/op_mul_mat_f16.comp
kompute-shaders/op_mul_mat_q8_0.comp
kompute-shaders/op_mul_mat_q4_0.comp
kompute-shaders/op_mul_mat_q4_1.comp
kompute-shaders/op_mul_mat_q4_k.comp
kompute-shaders/op_mul_mat_q6_k.comp
kompute-shaders/op_getrows_f32.comp
kompute-shaders/op_getrows_f16.comp
kompute-shaders/op_getrows_q4_0.comp
kompute-shaders/op_getrows_q4_1.comp
kompute-shaders/op_getrows_q6_k.comp
kompute-shaders/op_rope_f16.comp
kompute-shaders/op_rope_f32.comp
kompute-shaders/op_cpy_f16_f16.comp
kompute-shaders/op_cpy_f16_f32.comp
kompute-shaders/op_cpy_f32_f16.comp
kompute-shaders/op_cpy_f32_f32.comp
)
# Create a custom target for our generated shaders
add_custom_target(generated_shaders DEPENDS
shaderop_scale.h
shaderop_scale_8.h
shaderop_add.h
shaderop_addrow.h
shaderop_mul.h
shaderop_silu.h
shaderop_relu.h
shaderop_gelu.h
shaderop_softmax.h
shaderop_norm.h
shaderop_rmsnorm.h
shaderop_diagmask.h
shaderop_mul_mat_mat_f32.h
shaderop_mul_mat_f16.h
shaderop_mul_mat_q8_0.h
shaderop_mul_mat_q4_0.h
shaderop_mul_mat_q4_1.h
shaderop_mul_mat_q4_k.h
shaderop_mul_mat_q6_k.h
shaderop_getrows_f32.h
shaderop_getrows_f16.h
shaderop_getrows_q4_0.h
shaderop_getrows_q4_1.h
shaderop_getrows_q6_k.h
shaderop_rope_f16.h
shaderop_rope_f32.h
shaderop_cpy_f16_f16.h
shaderop_cpy_f16_f32.h
shaderop_cpy_f32_f16.h
shaderop_cpy_f32_f32.h
)
# Create a custom command that depends on the generated_shaders
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
DEPENDS generated_shaders
COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
)
# Add the stamp to the main sources to ensure dependency tracking
target_sources(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
else()
message(WARNING "Kompute not found")
endif()

Some files were not shown because too many files have changed in this diff Show More