Merge branch 'master' into gg/flash-attn

This commit is contained in:
Georgi Gerganov 2024-02-12 21:16:58 +02:00
commit 6875997fd6
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
84 changed files with 7764 additions and 14303 deletions

View File

@ -1,8 +1,8 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04 ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
ARG UBUNTU_VERSION=22.04
FROM intel/hpckit:$ONEAPI_VERSION as build FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y git apt-get install -y git
@ -10,16 +10,18 @@ WORKDIR /app
COPY . . COPY . .
# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
RUN mkdir build && \ RUN mkdir build && \
cd build && \ cd build && \
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \ if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
cmake --build . --config Release --target main server echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build . --config Release --target main
FROM ubuntu:$UBUNTU_VERSION as runtime FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
COPY --from=build /app/build/bin/main /main COPY --from=build /app/build/bin/main /main
COPY --from=build /app/build/bin/server /server
ENV LC_ALL=C.utf8 ENV LC_ALL=C.utf8

View File

@ -0,0 +1,29 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION as build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk
# Build it
WORKDIR /app
COPY . .
RUN mkdir build && \
cd build && \
cmake .. -DLLAMA_VULKAN=1 && \
cmake --build . --config Release --target main
# Clean up
WORKDIR /
RUN cp /app/build/bin/main /main && \
rm -rf /app
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ]

View File

@ -13,18 +13,22 @@
cudaPackages, cudaPackages,
darwin, darwin,
rocmPackages, rocmPackages,
vulkan-headers,
vulkan-loader,
clblast, clblast,
useBlas ? builtins.all (x: !x) [ useBlas ? builtins.all (x: !x) [
useCuda useCuda
useMetalKit useMetalKit
useOpenCL useOpenCL
useRocm useRocm
useVulkan
], ],
useCuda ? config.cudaSupport, useCuda ? config.cudaSupport,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL, useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
useMpi ? false, # Increases the runtime closure size by ~700M useMpi ? false, # Increases the runtime closure size by ~700M
useOpenCL ? false, useOpenCL ? false,
useRocm ? config.rocmSupport, useRocm ? config.rocmSupport,
useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
}@inputs: }@inputs:
@ -48,7 +52,8 @@ let
++ lib.optionals useMetalKit [ "MetalKit" ] ++ lib.optionals useMetalKit [ "MetalKit" ]
++ lib.optionals useMpi [ "MPI" ] ++ lib.optionals useMpi [ "MPI" ]
++ lib.optionals useOpenCL [ "OpenCL" ] ++ lib.optionals useOpenCL [ "OpenCL" ]
++ lib.optionals useRocm [ "ROCm" ]; ++ lib.optionals useRocm [ "ROCm" ]
++ lib.optionals useVulkan [ "Vulkan" ];
pnameSuffix = pnameSuffix =
strings.optionalString (suffices != [ ]) strings.optionalString (suffices != [ ])
@ -108,6 +113,11 @@ let
hipblas hipblas
rocblas rocblas
]; ];
vulkanBuildInputs = [
vulkan-headers
vulkan-loader
];
in in
effectiveStdenv.mkDerivation ( effectiveStdenv.mkDerivation (
@ -164,7 +174,8 @@ effectiveStdenv.mkDerivation (
++ optionals useCuda cudaBuildInputs ++ optionals useCuda cudaBuildInputs
++ optionals useMpi [ mpi ] ++ optionals useMpi [ mpi ]
++ optionals useOpenCL [ clblast ] ++ optionals useOpenCL [ clblast ]
++ optionals useRocm rocmBuildInputs; ++ optionals useRocm rocmBuildInputs
++ optionals useVulkan vulkanBuildInputs;
cmakeFlags = cmakeFlags =
[ [
@ -178,6 +189,7 @@ effectiveStdenv.mkDerivation (
(cmakeBool "LLAMA_HIPBLAS" useRocm) (cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit) (cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi) (cmakeBool "LLAMA_MPI" useMpi)
(cmakeBool "LLAMA_VULKAN" useVulkan)
] ]
++ optionals useCuda [ ++ optionals useCuda [
( (
@ -218,6 +230,7 @@ effectiveStdenv.mkDerivation (
useMpi useMpi
useOpenCL useOpenCL
useRocm useRocm
useVulkan
; ;
shell = mkShell { shell = mkShell {
@ -242,11 +255,11 @@ effectiveStdenv.mkDerivation (
# Configurations we don't want even the CI to evaluate. Results in the # Configurations we don't want even the CI to evaluate. Results in the
# "unsupported platform" messages. This is mostly a no-op, because # "unsupported platform" messages. This is mostly a no-op, because
# cudaPackages would've refused to evaluate anyway. # cudaPackages would've refused to evaluate anyway.
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin; badPlatforms = optionals (useCuda || useOpenCL || useVulkan) lib.platforms.darwin;
# Configurations that are known to result in build failures. Can be # Configurations that are known to result in build failures. Can be
# overridden by importing Nixpkgs with `allowBroken = true`. # overridden by importing Nixpkgs with `allowBroken = true`.
broken = (useMetalKit && !effectiveStdenv.isDarwin); broken = (useMetalKit && !effectiveStdenv.isDarwin) || (useVulkan && effectiveStdenv.isDarwin);
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}"; description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
homepage = "https://github.com/ggerganov/llama.cpp/"; homepage = "https://github.com/ggerganov/llama.cpp/";

View File

@ -1,8 +1,8 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04 ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
ARG UBUNTU_VERSION=22.04
FROM intel/hpckit:$ONEAPI_VERSION as build FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y git apt-get install -y git
@ -10,13 +10,16 @@ WORKDIR /app
COPY . . COPY . .
# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
RUN mkdir build && \ RUN mkdir build && \
cd build && \ cd build && \
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \ if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
cmake --build . --config Release --target main server echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build . --config Release --target server
FROM ubuntu:$UBUNTU_VERSION as runtime FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
COPY --from=build /app/build/bin/server /server COPY --from=build /app/build/bin/server /server

View File

@ -0,0 +1,29 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION as build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk
# Build it
WORKDIR /app
COPY . .
RUN mkdir build && \
cd build && \
cmake .. -DLLAMA_VULKAN=1 && \
cmake --build . --config Release --target server
# Clean up
WORKDIR /
RUN cp /app/build/bin/server /server && \
rm -rf /app
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]

View File

@ -1,2 +1,3 @@
[flake8] [flake8]
max-line-length = 125 max-line-length = 125
ignore = W503

View File

@ -184,6 +184,47 @@ jobs:
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx .. cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake --build . --config Release -j $(nproc) cmake --build . --config Release -j $(nproc)
ubuntu-22-cmake-sycl-fp16:
runs-on: ubuntu-22.04
continue-on-error: true
steps:
- uses: actions/checkout@v2
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
cmake --build . --config Release -j $(nproc)
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it. # how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124

View File

@ -16,5 +16,5 @@ jobs:
- name: flake8 Lint - name: flake8 Lint
uses: py-actions/flake8@v2 uses: py-actions/flake8@v2
with: with:
ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704" ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
exclude: "examples/*,examples/*/**,*/**/__init__.py" exclude: "examples/*,examples/*/**,*/**/__init__.py"

View File

@ -79,7 +79,7 @@ if (NOT MSVC)
endif() endif()
if (WIN32) if (WIN32)
option(LLAMA_WIN_VER "llama: Windows Version" 0x602) set(LLAMA_WIN_VER "0x602" CACHE STRING "llama: Windows Version")
endif() endif()
# 3rd party libs # 3rd party libs
@ -100,6 +100,10 @@ option(LLAMA_HIPBLAS "llama: use hipBLAS"
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_VULKAN "llama: use Vulkan" OFF) option(LLAMA_VULKAN "llama: use Vulkan" OFF)
option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF)
option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF)
option(LLAMA_VULKAN_VALIDATE "llama: enable Vulkan validation" OFF)
option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" OFF)
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF) option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
@ -431,6 +435,22 @@ if (LLAMA_VULKAN)
add_compile_definitions(GGML_USE_VULKAN) add_compile_definitions(GGML_USE_VULKAN)
if (LLAMA_VULKAN_CHECK_RESULTS)
target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_CHECK_RESULTS)
endif()
if (LLAMA_VULKAN_DEBUG)
target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_DEBUG)
endif()
if (LLAMA_VULKAN_VALIDATE)
target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_VALIDATE)
endif()
if (LLAMA_VULKAN_RUN_TESTS)
target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_RUN_TESTS)
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-vulkan) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-vulkan)
else() else()
message(WARNING "Vulkan not found") message(WARNING "Vulkan not found")
@ -789,9 +809,9 @@ if (LLAMA_CCACHE)
if (LLAMA_CCACHE_FOUND) if (LLAMA_CCACHE_FOUND)
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
set(ENV{CCACHE_SLOPPINESS} time_macros) set(ENV{CCACHE_SLOPPINESS} time_macros)
message(STATUS "Using ccache") message(STATUS "ccache found, compilation results will be cached. Disable with LLAMA_CCACHE=OFF.")
else() else()
message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF") message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with LLAMA_CCACHE=OFF")
endif () endif ()
endif() endif()
@ -830,7 +850,9 @@ endif()
set(ARCH_FLAGS "") set(ARCH_FLAGS "")
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64")) if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
message(STATUS "ARM detected") message(STATUS "ARM detected")
if (MSVC) if (MSVC)
add_compile_definitions(__ARM_NEON) add_compile_definitions(__ARM_NEON)
@ -856,7 +878,9 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
list(APPEND ARCH_FLAGS -mno-unaligned-access) list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif() endif()
endif() endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" ) elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
message(STATUS "x86 detected") message(STATUS "x86 detected")
if (MSVC) if (MSVC)
# instruction set detection for MSVC only # instruction set detection for MSVC only

187
Makefile
View File

@ -109,8 +109,21 @@ MK_NVCCFLAGS += -O3
else else
MK_CFLAGS += -O3 MK_CFLAGS += -O3
MK_CXXFLAGS += -O3 MK_CXXFLAGS += -O3
MK_NVCCFLAGS += -O3
endif endif
ifndef LLAMA_NO_CCACHE
CCACHE := $(shell which ccache)
ifdef CCACHE
export CCACHE_SLOPPINESS = time_macros
$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.)
CC := $(CCACHE) $(CC)
CXX := $(CCACHE) $(CXX)
else
$(info I ccache not found. Consider installing it for faster compilation.)
endif # CCACHE
endif # LLAMA_NO_CCACHE
# clock_gettime came in POSIX.1b (1993) # clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
# posix_memalign came in POSIX.1-2001 / SUSv3 # posix_memalign came in POSIX.1-2001 / SUSv3
@ -365,7 +378,7 @@ ifdef LLAMA_CUBLAS
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o OBJS += ggml-cuda.o
MK_NVCCFLAGS = -use_fast_math MK_NVCCFLAGS += -use_fast_math
ifndef JETSON_EOL_MODULE_DETECT ifndef JETSON_EOL_MODULE_DETECT
MK_NVCCFLAGS += --forward-unknown-to-host-compiler MK_NVCCFLAGS += --forward-unknown-to-host-compiler
endif # JETSON_EOL_MODULE_DETECT endif # JETSON_EOL_MODULE_DETECT
@ -373,9 +386,9 @@ ifdef LLAMA_DEBUG
MK_NVCCFLAGS += -lineinfo MK_NVCCFLAGS += -lineinfo
endif # LLAMA_DEBUG endif # LLAMA_DEBUG
ifdef LLAMA_CUDA_NVCC ifdef LLAMA_CUDA_NVCC
NVCC = $(LLAMA_CUDA_NVCC) NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
else else
NVCC = nvcc NVCC = $(CCACHE) nvcc
endif #LLAMA_CUDA_NVCC endif #LLAMA_CUDA_NVCC
ifdef CUDA_DOCKER_ARCH ifdef CUDA_DOCKER_ARCH
MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
@ -457,6 +470,18 @@ ifdef LLAMA_VULKAN_CHECK_RESULTS
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
endif endif
ifdef LLAMA_VULKAN_DEBUG
MK_CPPFLAGS += -DGGML_VULKAN_DEBUG
endif
ifdef LLAMA_VULKAN_VALIDATE
MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
endif
ifdef LLAMA_VULKAN_RUN_TESTS
MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS
endif
ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_VULKAN endif # LLAMA_VULKAN
@ -470,7 +495,7 @@ ifdef LLAMA_HIPBLAS
ROCM_PATH ?= /opt/rocm ROCM_PATH ?= /opt/rocm
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
endif endif
HIPCC ?= $(ROCM_PATH)/bin/hipcc HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1 LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2 LLAMA_CUDA_KQUANTS_ITER ?= 2
@ -542,6 +567,9 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
$(info I LDFLAGS: $(LDFLAGS)) $(info I LDFLAGS: $(LDFLAGS))
$(info I CC: $(shell $(CC) --version | head -n 1)) $(info I CC: $(shell $(CC) --version | head -n 1))
$(info I CXX: $(shell $(CXX) --version | head -n 1)) $(info I CXX: $(shell $(CXX) --version | head -n 1))
ifdef LLAMA_CUBLAS
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
endif # LLAMA_CUBLAS
$(info ) $(info )
# #
@ -591,97 +619,135 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
clean: clean:
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
find examples pocs -type f -name "*.o" -delete
# #
# Examples # Examples
# #
# $< is the first prerequisite, i.e. the source file.
# Explicitly compile this to an object file so that it can be cached with ccache.
# The source file is then filtered out from $^ (the list of all prerequisites) and the object file is added instead.
# Helper function that replaces .c, .cpp, and .cu file endings with .o:
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo @echo
@echo '==== Run ./main -h for help. ====' @echo '==== Run ./main -h for help. ===='
@echo @echo
infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS) batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS) quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS) quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS) export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift swift: examples/batched.swift
@ -689,7 +755,7 @@ swift: examples/batched.swift
endif endif
common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh $(CC) > $@.tmp @sh scripts/build-info.sh "$(CC)" > $@.tmp
@if ! cmp -s $@.tmp $@; then \ @if ! cmp -s $@.tmp $@; then \
mv $@.tmp $@; \ mv $@.tmp $@; \
else \ else \
@ -706,7 +772,8 @@ build-info.o: common/build-info.cpp
tests: $(TEST_TARGETS) tests: $(TEST_TARGETS)
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS) benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
run-benchmark-matmult: benchmark-matmult run-benchmark-matmult: benchmark-matmult
./$@ ./$@
@ -714,58 +781,76 @@ run-benchmark-matmult: benchmark-matmult
.PHONY: run-benchmark-matmult swift .PHONY: run-benchmark-matmult swift
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS) tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS) tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS) tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS) tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS) tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS) tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS) tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS) tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS) tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-c.o: tests/test-c.c llama.h tests/test-c.o: tests/test-c.c llama.h
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS) tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS) tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS) tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

View File

@ -13,17 +13,31 @@ let package = Package(
products: [ products: [
.library(name: "llama", targets: ["llama"]), .library(name: "llama", targets: ["llama"]),
], ],
dependencies: [
.package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
],
targets: [ targets: [
.target( .target(
name: "llama", name: "llama",
dependencies: ["ggml"],
path: ".", path: ".",
exclude: ["ggml-metal.metal"], exclude: [
"cmake",
"examples",
"scripts",
"models",
"tests",
"CMakeLists.txt",
"ggml-cuda.cu",
"ggml-cuda.h",
"Makefile"
],
sources: [ sources: [
"ggml.c",
"llama.cpp", "llama.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
"ggml-metal.m",
],
resources: [
.process("ggml-metal.metal")
], ],
publicHeadersPath: "spm-headers", publicHeadersPath: "spm-headers",
cSettings: [ cSettings: [

View File

@ -1,22 +1,15 @@
# llama.cpp for SYCL # llama.cpp for SYCL
[Background](#background) - [Background](#background)
- [OS](#os)
[OS](#os) - [Intel GPU](#intel-gpu)
- [Docker](#docker)
[Intel GPU](#intel-gpu) - [Linux](#linux)
- [Windows](#windows)
[Linux](#linux) - [Environment Variable](#environment-variable)
- [Known Issue](#known-issue)
[Windows](#windows) - [Q&A](#q&a)
- [Todo](#todo)
[Environment Variable](#environment-variable)
[Known Issue](#known-issue)
[Q&A](#q&a)
[Todo](#todo)
## Background ## Background
@ -36,20 +29,65 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|OS|Status|Verified| |OS|Status|Verified|
|-|-|-| |-|-|-|
|Linux|Support|Ubuntu 22.04| |Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
|Windows|Support|Windows 11| |Windows|Support|Windows 11|
## Intel GPU ## Intel GPU
### Verified
|Intel GPU| Status | Verified Model| |Intel GPU| Status | Verified Model|
|-|-|-| |-|-|-|
|Intel Data Center Max Series| Support| Max 1550| |Intel Data Center Max Series| Support| Max 1550|
|Intel Data Center Flex Series| Support| Flex 170| |Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770, 730M| |Intel Arc Series| Support| Arc 770, 730M|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake| |Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7| |Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use.
### Memory
The memory is a limitation to run LLM on GPUs.
When run llama.cpp, there is print log to show the applied memory on GPU. You could know how much memory to be used in your case. Like `llm_load_tensors: buffer size = 3577.56 MiB`.
For iGPU, please make sure the shared memory from host memory is enough. For llama-2-7b.Q4_0, recommend the host memory is 8GB+.
For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.
## Docker
Note:
- Only docker on Linux is tested. Docker on WSL may not work.
- You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that)
### Build the image
You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
```sh
# For F16:
#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
# Or, for F32:
docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
# Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example
```
### Run
```sh
# Firstly, find all the DRI cards:
ls -la /dev/dri
# Then, pick the card that you want to use.
# For example with "/dev/dri/card1"
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
## Linux ## Linux
@ -63,7 +101,7 @@ Note: for iGPU, please install the client GPU driver.
b. Add user to group: video, render. b. Add user to group: video, render.
``` ```sh
sudo usermod -aG render username sudo usermod -aG render username
sudo usermod -aG video username sudo usermod -aG video username
``` ```
@ -72,7 +110,7 @@ Note: re-login to enable it.
c. Check c. Check
``` ```sh
sudo apt install clinfo sudo apt install clinfo
sudo clinfo -l sudo clinfo -l
``` ```
@ -90,7 +128,6 @@ Platform #0: Intel(R) OpenCL HD Graphics
2. Install Intel® oneAPI Base toolkit. 2. Install Intel® oneAPI Base toolkit.
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html). a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
Recommend to install to default folder: **/opt/intel/oneapi**. Recommend to install to default folder: **/opt/intel/oneapi**.
@ -99,13 +136,13 @@ Following guide use the default folder as example. If you use other folder, plea
b. Check b. Check
``` ```sh
source /opt/intel/oneapi/setvars.sh source /opt/intel/oneapi/setvars.sh
sycl-ls sycl-ls
``` ```
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**. There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
Output (example): Output (example):
``` ```
@ -118,21 +155,25 @@ Output (example):
2. Build locally: 2. Build locally:
``` Note:
- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
```sh
mkdir -p build mkdir -p build
cd build cd build
source /opt/intel/oneapi/setvars.sh source /opt/intel/oneapi/setvars.sh
#for FP16 # For FP16:
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference #cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
#for FP32 # Or, for FP32:
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only # Build example/main only
#cmake --build . --config Release --target main #cmake --build . --config Release --target main
#build all binary # Or, build all binary
cmake --build . --config Release -v cmake --build . --config Release -v
cd .. cd ..
@ -140,18 +181,16 @@ cd ..
or or
``` ```sh
./examples/sycl/build.sh ./examples/sycl/build.sh
``` ```
Note:
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
### Run ### Run
1. Put model file to folder **models** 1. Put model file to folder **models**
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
2. Enable oneAPI running environment 2. Enable oneAPI running environment
``` ```
@ -162,10 +201,10 @@ source /opt/intel/oneapi/setvars.sh
Run without parameter: Run without parameter:
``` ```sh
./build/bin/ls-sycl-device ./build/bin/ls-sycl-device
or # or running the "main" executable and look at the output log:
./build/bin/main ./build/bin/main
``` ```
@ -194,13 +233,13 @@ found 4 SYCL devices:
Set device ID = 0 by **GGML_SYCL_DEVICE=0** Set device ID = 0 by **GGML_SYCL_DEVICE=0**
``` ```sh
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
``` ```
or run by script: or run by script:
``` ```sh
./examples/sycl/run-llama2.sh ./examples/sycl/run_llama2.sh
``` ```
Note: Note:
@ -223,7 +262,13 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html). Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
2. Install Intel® oneAPI Base toolkit. Note: **The driver is mandatory for compute function**.
2. Install Visual Studio.
Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact oneAPI environment enabling in Windows.
3. Install Intel® oneAPI Base toolkit.
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html). a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
@ -252,7 +297,7 @@ In oneAPI command line:
sycl-ls sycl-ls
``` ```
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**. There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
Output (example): Output (example):
``` ```
@ -260,15 +305,19 @@ Output (example):
[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000] [opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [31.0.101.5186] [opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [31.0.101.5186]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044] [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
``` ```
3. Install cmake & make 4. Install cmake & make
a. Download & install cmake for windows: https://cmake.org/download/ a. Download & install cmake for Windows: https://cmake.org/download/
b. Download & install make for windows provided by mingw-w64: https://www.mingw-w64.org/downloads/ b. Download & install mingw-w64 make for Windows provided by w64devkit
- Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
- Extract `w64devkit` on your pc.
- Add the **bin** folder path in the Windows system PATH environment, like `C:\xxx\w64devkit\bin\`.
### Build locally: ### Build locally:
@ -309,6 +358,8 @@ Note:
1. Put model file to folder **models** 1. Put model file to folder **models**
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
2. Enable oneAPI running environment 2. Enable oneAPI running environment
- In Search, input 'oneAPI'. - In Search, input 'oneAPI'.
@ -405,7 +456,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block. llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
Solution: add **--no-mmap**. Solution: add **--no-mmap** or **--mmap 0**.
## Q&A ## Q&A
@ -419,8 +470,25 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
Miss to enable oneAPI running environment. Miss to enable oneAPI running environment.
- Meet compile error.
Remove folder **build** and try again.
- I can **not** see **[ext_oneapi_level_zero:gpu:0]** afer install GPU driver in Linux.
Please run **sudo sycl-ls**.
If you see it in result, please add video/render group to your ID:
```
sudo usermod -aG render username
sudo usermod -aG video username
```
Then **relogin**.
If you do not see it, please check the installation GPU steps again.
## Todo ## Todo
- Support to build in Windows.
- Support multiple cards. - Support multiple cards.

263
README.md
View File

@ -6,7 +6,7 @@
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
### Hot topics ### Hot topics
@ -33,17 +33,14 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
<li><a href="#get-the-code">Get the Code</a></li> <li><a href="#get-the-code">Get the Code</a></li>
<li><a href="#build">Build</a></li> <li><a href="#build">Build</a></li>
<li><a href="#blas-build">BLAS Build</a></li> <li><a href="#blas-build">BLAS Build</a></li>
<li><a href="#prepare-data--run">Prepare Data & Run</a></li> <li><a href="#prepare-and-quantize">Prepare and Quantize</a></li>
<li><a href="#run-the-quantized-model">Run the quantized model</a></li>
<li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li> <li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li>
<li><a href="#quantization">Quantization</a></li> <li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li> <li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li> <li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
<li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li> <li><a href="#instruct-mode">Instruct mode</a></li>
<li><a href="#using-openllama">Using OpenLLaMA</a></li> <li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
<li><a href="#using-gpt4all">Using GPT4All</a></li>
<li><a href="#using-pygmalion-7b--metharme-7b">Using Pygmalion 7B & Metharme 7B</a></li>
<li><a href="#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data">Obtaining the Facebook LLaMA original model and Stanford Alpaca model data</a></li>
<li><a href="#verifying-the-model-files">Verifying the model files</a></li>
<li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li> <li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
<li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li> <li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
<li><a href="#android">Android</a></li> <li><a href="#android">Android</a></li>
@ -58,18 +55,20 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
## Description ## Description
The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quantization on a MacBook The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
variety of hardware - locally and in the cloud.
- Plain C/C++ implementation without dependencies - Plain C/C++ implementation without any dependencies
- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2 and AVX512 support for x86 architectures - AVX, AVX2 and AVX512 support for x86 architectures
- Mixed F16 / F32 precision - 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
- CUDA, Metal, OpenCL, SYCL GPU backend support - Vulkan, SYCL, and (partial) OpenCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022). Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves improved significantly thanks to many contributions. It is the main playground for developing new features for the
as the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library. [ggml](https://github.com/ggerganov/ggml) library.
**Supported platforms:** **Supported platforms:**
@ -77,44 +76,46 @@ as the main playground for developing new features for the [ggml](https://github
- [X] Linux - [X] Linux
- [X] Windows (via CMake) - [X] Windows (via CMake)
- [X] Docker - [X] Docker
- [X] FreeBSD
**Supported models:** **Supported models:**
Typically finetunes of the base models below are supported as well.
- [X] LLaMA 🦙 - [X] LLaMA 🦙
- [x] LLaMA 2 🦙🦙 - [x] LLaMA 2 🦙🦙
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [X] Falcon - [X] Falcon
- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
- [X] [Pygmalion/Metharme](#using-pygmalion-7b--metharme-7b)
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft) - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410) - [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417) - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi) - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
- [X] [StableLM-3b-4e1t](https://github.com/ggerganov/llama.cpp/pull/3586) - [X] [StableLM models](https://huggingface.co/stabilityai)
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek) - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen) - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557) - [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
- [x] [GPT-2](https://huggingface.co/gpt2) - [x] [GPT-2](https://huggingface.co/gpt2)
- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
- [x] [InternLM2](https://huggingface.co/models?search=internlm2)
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
**Multimodal models:** **Multimodal models:**
- [x] [Llava 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e) - [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e)
- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava) - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5) - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V) - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM) - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
**Bindings:** **Bindings:**
@ -123,6 +124,7 @@ as the main playground for developing new features for the [ggml](https://github
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
@ -136,19 +138,30 @@ as the main playground for developing new features for the [ggml](https://github
**UI:** **UI:**
- [nat/openplayground](https://github.com/nat/openplayground) Unless otherwise noted these projects are open-source with permissive licensing:
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
- [withcatai/catai](https://github.com/withcatai/catai)
- [semperai/amica](https://github.com/semperai/amica)
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [iohub/collama](https://github.com/iohub/coLLaMA) - [iohub/collama](https://github.com/iohub/coLLaMA)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [nat/openplayground](https://github.com/nat/openplayground)
- [Faraday](https://faraday.dev/) (proprietary)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
- [ollama/ollama](https://github.com/ollama/ollama)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)
--- ---
Here is a typical run using LLaMA v2 13B on M2 Ultra: Here is a typical run using LLaMA v2 13B on M2 Ultra:
```java ```
$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e $ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
I llama.cpp build info: I llama.cpp build info:
I UNAME_S: Darwin I UNAME_S: Darwin
@ -232,7 +245,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
## Usage ## Usage
Here are the end-to-end binary build and model conversion steps for the LLaMA-7B model. Here are the end-to-end binary build and model conversion steps for most supported models.
### Get the Code ### Get the Code
@ -393,28 +406,28 @@ Building the program with BLAS support may lead to some performance improvements
Check [BLIS.md](docs/BLIS.md) for more information. Check [BLIS.md](docs/BLIS.md) for more information.
- #### SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
- #### Intel oneMKL - #### Intel oneMKL
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
- Using manual oneAPI installation: - Using manual oneAPI installation:
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps: By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
```bash ```bash
mkdir build mkdir build
cd build cd build
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-runtime docker image, only required for manual installation source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
cmake --build . --config Release cmake --build . --config Release
``` ```
- Using oneAPI docker image: - Using oneAPI docker image:
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-runtime](https://hub.docker.com/r/intel/oneapi-runtime) If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
```bash
mkdir build
cd build
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
cmake --build . --config Release
```
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni.
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information. Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
@ -601,43 +614,87 @@ Building the program with BLAS support may lead to some performance improvements
You can get a list of platforms and devices from the `clinfo -l` command, etc. You can get a list of platforms and devices from the `clinfo -l` command, etc.
- #### SYCL - #### Vulkan
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators. **With docker**:
llama.cpp based on SYCL is used to support Intel GPU (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU). You don't need to install Vulkan SDK. It will be installed inside the container.
For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md). ```sh
# Build the image
docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
# Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
### Prepare Data & Run **Without docker**:
Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
For example, on Ubuntu 22.04 (jammy), use the command below:
```bash
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
apt update -y
apt-get install -y vulkan-sdk
# To verify the installation, use the command below:
vulkaninfo
```
Alternatively your package manager might be able to provide the appropiate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
Then, build llama.cpp using the cmake command below:
```bash
mkdir -p build
cd build
cmake .. -DLLAMA_VULKAN=1
cmake --build . --config Release
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
# You should see in the output, ggml_vulkan detected your GPU. For example:
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
```
### Prepare and Quantize
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
```bash ```bash
# obtain the original LLaMA model weights and place them in ./models # obtain the official LLaMA model weights and place them in ./models
ls ./models ls ./models
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model llama-2-7b tokenizer_checklist.chk tokenizer.model
# [Optional] for models using BPE tokenizers # [Optional] for models using BPE tokenizers
ls ./models ls ./models
65B 30B 13B 7B vocab.json <folder containing weights and tokenizer json> vocab.json
# [Optional] for PyTorch .bin models like Mistral-7B
ls ./models
<folder containing weights and tokenizer json>
# install Python dependencies # install Python dependencies
python3 -m pip install -r requirements.txt python3 -m pip install -r requirements.txt
# convert the 7B model to ggml FP16 format # convert the model to ggml FP16 format
python3 convert.py models/7B/ python3 convert.py models/mymodel/
# [Optional] for models using BPE tokenizers # [Optional] for models using BPE tokenizers
python convert.py models/7B/ --vocabtype bpe python convert.py models/mymodel/ --vocab-type bpe
# quantize the model to 4-bits (using q4_0 method) # quantize the model to 4-bits (using Q4_K_M method)
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0 ./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
# update the gguf filetype to current if older version is unsupported by another application # update the gguf filetype to current version if older version is now unsupported
./quantize ./models/7B/ggml-model-q4_0.gguf ./models/7B/ggml-model-q4_0-v2.gguf COPY ./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
```
### Run the quantized model
# run the inference ```bash
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 # start inference on a gguf model
./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
``` ```
When running the larger models, make sure you have enough disk space to store all the intermediate files. When running the larger models, make sure you have enough disk space to store all the intermediate files.
@ -658,7 +715,7 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
| Model | Original size | Quantized size (4-bit) | | Model | Original size | Quantized size (Q4_0) |
|------:|--------------:|-----------------------:| |------:|--------------:|-----------------------:|
| 7B | 13 GB | 3.9 GB | | 7B | 13 GB | 3.9 GB |
| 13B | 24 GB | 7.8 GB | | 13B | 24 GB | 7.8 GB |
@ -685,9 +742,21 @@ Several quantization methods are supported. They differ in the resulting model d
| 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 | | 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684) - [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
- recent k-quants improvements - recent k-quants improvements and new i-quants
- [#2707](https://github.com/ggerganov/llama.cpp/pull/2707) - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
- [#2807](https://github.com/ggerganov/llama.cpp/pull/2807) - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
- [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773)
- [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856)
- [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861)
- [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872)
- [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897)
- [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
- [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
- [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
- [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
- [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
- [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
- [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
### Perplexity (measuring model quality) ### Perplexity (measuring model quality)
@ -762,9 +831,9 @@ The `grammars/` folder contains a handful of sample grammars. To write your own,
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one. For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
### Instruction mode with Alpaca ### Instruct mode
1. First, download the `ggml` Alpaca model into the `./models` folder 1. First, download and place the `ggml` model into the `./models` folder
2. Run the `main` tool like this: 2. Run the `main` tool like this:
``` ```
@ -790,50 +859,6 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
> >
``` ```
### Using [OpenLLaMA](https://github.com/openlm-research/open_llama)
OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights.
- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face.
- Convert the model to ggml FP16 format using `python convert.py <path to OpenLLaMA directory>`
### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
*Note: these instructions are likely obsoleted by the GGUF update*
- Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
- Obtain the `added_tokens.json` file from Alpaca model and put it to `models`
- Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B`
- It is distributed in the old `ggml` format which is now obsoleted
- You have to convert it to the new format using `convert.py`:
```bash
python3 convert.py models/gpt4all-7B/gpt4all-lora-quantized.bin
```
- You can now use the newly generated `models/gpt4all-7B/ggml-model-q4_0.bin` model in exactly the same way as all other models
- The newer GPT4All-J model is not yet supported!
### Using Pygmalion 7B & Metharme 7B
- Obtain the [LLaMA weights](#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data)
- Obtain the [Pygmalion 7B](https://huggingface.co/PygmalionAI/pygmalion-7b/) or [Metharme 7B](https://huggingface.co/PygmalionAI/metharme-7b) XOR encoded weights
- Convert the LLaMA model with [the latest HF convert script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py)
- Merge the XOR files with the converted LLaMA weights by running the [xor_codec](https://huggingface.co/PygmalionAI/pygmalion-7b/blob/main/xor_codec.py) script
- Convert to `ggml` format using the `convert.py` script in this repo:
```bash
python3 convert.py pygmalion-7b/ --outtype q4_1
```
> The Pygmalion 7B & Metharme 7B weights are saved in [bfloat16](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format) precision. If you wish to convert to `ggml` without quantizating, please specify the `--outtype` as `f32` instead of `f16`.
### Obtaining the Facebook LLaMA original model and Stanford Alpaca model data
- **Under no circumstances should IPFS, magnet links, or any other links to model downloads be shared anywhere in this repository, including in issues, discussions, or pull requests. They will be immediately deleted.**
- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
- Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.
### Obtaining and using the Facebook LLaMA 2 model ### Obtaining and using the Facebook LLaMA 2 model
- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data. - Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
@ -845,20 +870,6 @@ python3 convert.py pygmalion-7b/ --outtype q4_1
- [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF) - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF)
- [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGUF) - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGUF)
### Verifying the model files
Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
- The following python script will verify if you have all possible latest files in your self-installed `./models` subdirectory:
```bash
# run the verification script
./scripts/verify-checksum-models.py
```
- On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory:
- On Linux: `sha256sum --ignore-missing -c SHA256SUMS`
- on macOS: `shasum -a 256 --ignore-missing -c SHA256SUMS`
### Seminal papers and background on the models ### Seminal papers and background on the models
If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:

View File

@ -1,40 +0,0 @@
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin
ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf models/7B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_1.bin
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin
fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5 models/13B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_1.bin
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth
24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth
1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth
7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin
d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d models/30B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_1.bin
2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json
135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth
9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth
e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770 models/65B/consolidated.02.pth
73176ffb426b40482f2aa67ae1217ef79fbbd1fff5482bae5060cdc5a24ab70e models/65B/consolidated.03.pth
882e6431d0b08a8bc66261a0d3607da21cbaeafa96a24e7e59777632dbdac225 models/65B/consolidated.04.pth
a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/consolidated.05.pth
72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth
d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth
60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin
cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92 models/65B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_1.bin
999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json
9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model

View File

@ -46,6 +46,10 @@
#define GGML_USE_CUBLAS_SYCL #define GGML_USE_CUBLAS_SYCL
#endif #endif
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
#define GGML_USE_CUBLAS_SYCL_VULKAN
#endif
int32_t get_num_physical_cores() { int32_t get_num_physical_cores() {
#ifdef __linux__ #ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores // enumerate the set of thread siblings, num entries is num cores
@ -336,13 +340,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true; invalid_param = true;
break; break;
} }
sparams.samplers_sequence = parse_samplers_input(argv[i]); const auto sampler_names = string_split(argv[i], ';');
sparams.samplers_sequence = sampler_types_from_names(sampler_names);
} else if (arg == "--sampling-seq") { } else if (arg == "--sampling-seq") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
sparams.samplers_sequence = argv[i]; sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
} else if (arg == "--top-p") { } else if (arg == "--top-p") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -399,6 +404,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break; break;
} }
sparams.penalty_present = std::stof(argv[i]); sparams.penalty_present = std::stof(argv[i]);
} else if (arg == "--dynatemp-range") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.dynatemp_range = std::stof(argv[i]);
} else if (arg == "--dynatemp-exp") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.dynatemp_exponent = std::stof(argv[i]);
} else if (arg == "--mirostat") { } else if (arg == "--mirostat") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -515,7 +532,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true; invalid_param = true;
break; break;
} }
params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f)); params.lora_adapter.emplace_back(argv[i], 1.0f);
params.use_mmap = false; params.use_mmap = false;
} else if (arg == "--lora-scaled") { } else if (arg == "--lora-scaled") {
if (++i >= argc) { if (++i >= argc) {
@ -527,7 +544,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true; invalid_param = true;
break; break;
} }
params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i]))); params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
params.use_mmap = false; params.use_mmap = false;
} else if (arg == "--lora-base") { } else if (arg == "--lora-base") {
if (++i >= argc) { if (++i >= argc) {
@ -648,8 +665,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.tensor_split[i] = 0.0f; params.tensor_split[i] = 0.0f;
} }
} }
#ifndef GGML_USE_CUBLAS_SYCL #ifndef GGML_USE_CUBLAS_SYCL_VULKAN
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n"); fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
#endif // GGML_USE_CUBLAS_SYCL #endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--no-mmap") { } else if (arg == "--no-mmap") {
params.use_mmap = false; params.use_mmap = false;
@ -664,7 +681,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true; invalid_param = true;
break; break;
} }
params.antiprompt.push_back(argv[i]); params.antiprompt.emplace_back(argv[i]);
} else if (arg == "-ld" || arg == "--logdir") { } else if (arg == "-ld" || arg == "--logdir") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -880,7 +897,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
} }
if (!params.kv_overrides.empty()) { if (!params.kv_overrides.empty()) {
params.kv_overrides.emplace_back(llama_model_kv_override()); params.kv_overrides.emplace_back();
params.kv_overrides.back().key[0] = 0; params.kv_overrides.back().key[0] = 0;
} }
@ -890,6 +907,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
const llama_sampling_params & sparams = params.sparams; const llama_sampling_params & sparams = params.sparams;
std::string sampler_type_chars;
std::string sampler_type_names;
for (const auto sampler_type : sparams.samplers_sequence) {
sampler_type_chars += static_cast<char>(sampler_type);
sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
}
sampler_type_names.pop_back();
printf("\n"); printf("\n");
printf("usage: %s [options]\n", argv[0]); printf("usage: %s [options]\n", argv[0]);
printf("\n"); printf("\n");
@ -931,8 +956,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --samplers samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n"); printf(" --samplers samplers that will be used for generation in the order, separated by \';\' (default: %s)\n", sampler_type_names.c_str());
printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str()); printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
@ -942,6 +967,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat); printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present); printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq); printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range);
printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent);
printf(" --mirostat N use Mirostat sampling.\n"); printf(" --mirostat N use Mirostat sampling.\n");
printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat); printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
@ -1079,45 +1106,85 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
} }
// //
// String parsing // String utils
// //
std::string parse_samplers_input(std::string input) { std::vector<std::string> string_split(std::string input, char separator) {
std::string output = ""; std::vector<std::string> parts;
size_t separator_pos = input.find(separator);
while (separator_pos != std::string::npos) {
std::string part = input.substr(0, separator_pos);
parts.emplace_back(part);
input = input.substr(separator_pos + 1);
separator_pos = input.find(separator);
}
parts.emplace_back(input);
return parts;
}
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names) {
// since samplers names are written multiple ways // since samplers names are written multiple ways
// make it ready for both system names and input names // make it ready for both system names and input names
std::unordered_map<std::string, char> samplers_symbols { std::unordered_map<std::string, llama_sampler_type> sampler_name_map {
{"top_k", 'k'}, {"top_k", llama_sampler_type::TOP_K},
{"top-k", 'k'}, {"top-k", llama_sampler_type::TOP_K},
{"top_p", 'p'}, {"top_p", llama_sampler_type::TOP_P},
{"top-p", 'p'}, {"top-p", llama_sampler_type::TOP_P},
{"nucleus", 'p'}, {"nucleus", llama_sampler_type::TOP_P},
{"typical_p", 'y'}, {"typical_p", llama_sampler_type::TYPICAL_P},
{"typical-p", 'y'}, {"typical-p", llama_sampler_type::TYPICAL_P},
{"typical", 'y'}, {"typical", llama_sampler_type::TYPICAL_P},
{"min_p", 'm'}, {"min_p", llama_sampler_type::MIN_P},
{"min-p", 'm'}, {"min-p", llama_sampler_type::MIN_P},
{"tfs_z", 'f'}, {"tfs_z", llama_sampler_type::TFS_Z},
{"tfs-z", 'f'}, {"tfs-z", llama_sampler_type::TFS_Z},
{"tfs", 'f'}, {"tfs", llama_sampler_type::TFS_Z},
{"temp", 't'}, {"temp", llama_sampler_type::TEMP},
{"temperature",'t'} {"temperature", llama_sampler_type::TEMP}
}; };
// expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
size_t separator = input.find(';');
while (separator != input.npos) {
std::string name = input.substr(0,separator);
input = input.substr(separator+1);
separator = input.find(';');
if (samplers_symbols.find(name) != samplers_symbols.end()) { std::vector<llama_sampler_type> sampler_types;
output += samplers_symbols[name]; sampler_types.reserve(names.size());
for (const auto& name : names) {
const auto sampler_item = sampler_name_map.find(name);
if (sampler_item != sampler_name_map.end()) {
sampler_types.push_back(sampler_item->second);
} }
} }
if (samplers_symbols.find(input) != samplers_symbols.end()) { return sampler_types;
output += samplers_symbols[input]; }
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
std::unordered_map<char, llama_sampler_type> sampler_name_map {
{'k', llama_sampler_type::TOP_K},
{'p', llama_sampler_type::TOP_P},
{'y', llama_sampler_type::TYPICAL_P},
{'m', llama_sampler_type::MIN_P},
{'f', llama_sampler_type::TFS_Z},
{'t', llama_sampler_type::TEMP}
};
std::vector<llama_sampler_type> sampler_types;
sampler_types.reserve(names_string.size());
for (const auto & c : names_string) {
const auto sampler_item = sampler_name_map.find(c);
if (sampler_item != sampler_name_map.end()) {
sampler_types.push_back(sampler_item->second);
}
}
return sampler_types;
}
std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
switch (sampler_type) {
case llama_sampler_type::TOP_K: return "top_k";
case llama_sampler_type::TFS_Z: return "tfs_z";
case llama_sampler_type::TYPICAL_P: return "typical_p";
case llama_sampler_type::TOP_P: return "top_p";
case llama_sampler_type::MIN_P: return "min_p";
case llama_sampler_type::TEMP: return "temp";
default : return "";
} }
return output;
} }
// //
@ -1532,6 +1599,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
#ifdef NDEBUG #ifdef NDEBUG
fprintf(stream, "debug: false\n"); fprintf(stream, "debug: false\n");

View File

@ -75,8 +75,7 @@ struct gpt_params {
float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_fast = 32.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length int32_t yarn_orig_ctx = 0; // YaRN original context length
int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
// pinging @cebtenzzre
// // sampling parameters // // sampling parameters
struct llama_sampling_params sparams; struct llama_sampling_params sparams;
@ -163,10 +162,13 @@ std::string gpt_random_prompt(std::mt19937 & rng);
void process_escapes(std::string& input); void process_escapes(std::string& input);
// //
// String parsing // String utils
// //
std::string parse_samplers_input(std::string input); std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names);
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
std::vector<std::string> string_split(std::string input, char separator);
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
// //
// Model utils // Model utils

View File

@ -103,15 +103,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
std::string llama_sampling_order_print(const llama_sampling_params & params) { std::string llama_sampling_order_print(const llama_sampling_params & params) {
std::string result = "CFG -> Penalties "; std::string result = "CFG -> Penalties ";
if (params.mirostat == 0) { if (params.mirostat == 0) {
for (auto s : params.samplers_sequence) { for (auto sampler_type : params.samplers_sequence) {
switch (s) { const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
case 'k': result += "-> top_k "; break; if (!sampler_type_name.empty()) {
case 'f': result += "-> tfs_z "; break; result += "-> " + sampler_type_name + " ";
case 'y': result += "-> typical_p "; break;
case 'p': result += "-> top_p "; break;
case 'm': result += "-> min_p "; break;
case 't': result += "-> temp "; break;
default : break;
} }
} }
} else { } else {
@ -127,26 +122,24 @@ static void sampler_queue(
const llama_sampling_params & params, const llama_sampling_params & params,
llama_token_data_array & cur_p, llama_token_data_array & cur_p,
size_t & min_keep) { size_t & min_keep) {
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const float temp = params.temp; const float temp = params.temp;
const float dynatemp_range = params.dynatemp_range; const float dynatemp_range = params.dynatemp_range;
const float dynatemp_exponent = params.dynatemp_exponent; const float dynatemp_exponent = params.dynatemp_exponent;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; const int32_t top_k = params.top_k;
const float top_p = params.top_p; const float top_p = params.top_p;
const float min_p = params.min_p; const float min_p = params.min_p;
const float tfs_z = params.tfs_z; const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p; const float typical_p = params.typical_p;
const std::string & samplers_sequence = params.samplers_sequence; const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
for (auto s : samplers_sequence) { for (auto sampler_type : samplers_sequence) {
switch (s){ switch (sampler_type) {
case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
case 't': case llama_sampler_type::TEMP:
if (dynatemp_range > 0) { if (dynatemp_range > 0) {
float dynatemp_min = std::max(0.0f, temp - dynatemp_range); float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
float dynatemp_max = std::max(0.0f, temp + dynatemp_range); float dynatemp_max = std::max(0.0f, temp + dynatemp_range);

View File

@ -8,6 +8,16 @@
#include <vector> #include <vector>
#include <unordered_map> #include <unordered_map>
// sampler types
enum class llama_sampler_type : char {
TOP_K = 'k',
TOP_P = 'p',
MIN_P = 'm',
TFS_Z = 'f',
TYPICAL_P = 'y',
TEMP = 't'
};
// sampling parameters // sampling parameters
typedef struct llama_sampling_params { typedef struct llama_sampling_params {
int32_t n_prev = 64; // number of previous tokens to remember int32_t n_prev = 64; // number of previous tokens to remember
@ -28,7 +38,15 @@ typedef struct llama_sampling_params {
float mirostat_tau = 5.00f; // target entropy float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = true; // consider newlines as a repeatable token bool penalize_nl = true; // consider newlines as a repeatable token
std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
std::vector<llama_sampler_type> samplers_sequence = {
llama_sampler_type::TOP_K,
llama_sampler_type::TFS_Z,
llama_sampler_type::TYPICAL_P,
llama_sampler_type::TOP_P,
llama_sampler_type::MIN_P,
llama_sampler_type::TEMP
};
std::string grammar; // optional BNF-like grammar to constrain sampling std::string grammar; // optional BNF-like grammar to constrain sampling

View File

@ -22,6 +22,8 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
from convert import HfVocab
# check for any of the given keys in the dictionary and return the value of the first key found # check for any of the given keys in the dictionary and return the value of the first key found
def get_key_opts(d, keys): def get_key_opts(d, keys):
@ -205,6 +207,10 @@ class Model:
return OrionModel return OrionModel
if model_architecture == "InternLM2ForCausalLM": if model_architecture == "InternLM2ForCausalLM":
return InternLM2Model return InternLM2Model
if model_architecture == "MiniCPMForCausalLM":
return MiniCPMModel
if model_architecture == "BertModel":
return BertModel
return Model return Model
def _is_model_safetensors(self) -> bool: def _is_model_safetensors(self) -> bool:
@ -258,6 +264,10 @@ class Model:
return gguf.MODEL_ARCH.ORION return gguf.MODEL_ARCH.ORION
if arch == "InternLM2ForCausalLM": if arch == "InternLM2ForCausalLM":
return gguf.MODEL_ARCH.INTERNLM2 return gguf.MODEL_ARCH.INTERNLM2
if arch == "MiniCPMForCausalLM":
return gguf.MODEL_ARCH.MINICPM
if arch == "BertModel":
return gguf.MODEL_ARCH.BERT
raise NotImplementedError(f'Architecture "{arch}" not supported!') raise NotImplementedError(f'Architecture "{arch}" not supported!')
@ -402,6 +412,31 @@ class Model:
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_hf(self):
path = self.dir_model
added_tokens_path = self.dir_model
vocab = HfVocab(
path, added_tokens_path if added_tokens_path.exists() else None
)
tokens = []
scores = []
toktypes = []
for text, score, toktype in vocab.all_tokens():
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
assert len(tokens) == vocab.vocab_size
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
class GPTNeoXModel(Model): class GPTNeoXModel(Model):
def set_gguf_parameters(self): def set_gguf_parameters(self):
@ -1041,6 +1076,83 @@ class MixtralModel(Model):
self._set_vocab_sentencepiece() self._set_vocab_sentencepiece()
class MiniCPMModel(Model):
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
self.gguf_writer.add_name("MiniCPM")
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_file_type(self.ftype)
def set_vocab(self):
self._set_vocab_hf()
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (
weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape)
)
def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
n_head = self.hparams.get("num_attention_heads")
n_kv_head = self.hparams.get("num_key_value_heads")
for name, data_torch in self.get_tensors():
# we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
continue
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
# HF models permute some of the tensors, so we need to undo that
if name.endswith(("q_proj.weight")):
data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight")):
data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
data = data_torch.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
class QwenModel(Model): class QwenModel(Model):
@staticmethod @staticmethod
def token_bytes_to_string(b): def token_bytes_to_string(b):
@ -1138,7 +1250,7 @@ class GPT2Model(Model):
for name, data_torch in self.get_tensors(): for name, data_torch in self.get_tensors():
# we don't need these # we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias")): if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")):
continue continue
if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
@ -1416,8 +1528,32 @@ class InternLM2Model(Model):
self.gguf_writer.add_add_space_prefix(add_prefix) self.gguf_writer.add_add_space_prefix(add_prefix)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
old_eos = special_vocab.special_token_ids["eos"]
if "chat" in os.path.basename(self.dir_model.absolute()):
# For the chat model, we replace the eos with '<|im_end|>'.
special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
in chat mode so that the conversation can end normally.")
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
def _try_get_sft_eos(self, tokenizer):
unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]')
im_end_list = tokenizer.encode('<|im_end|>')
assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
if len(unused_145_list) == 1:
eos_token = unused_145_list[0]
if len(im_end_list) == 1:
eos_token = im_end_list[0]
return eos_token
def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def set_gguf_parameters(self): def set_gguf_parameters(self):
self.gguf_writer.add_name("InternLM2") self.gguf_writer.add_name("InternLM2")
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
@ -1486,8 +1622,9 @@ class InternLM2Model(Model):
qkv = data_torch qkv = data_torch
qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
q = rearrange(q, " o g n i -> o (g n i)").T # The model weights of q and k equire additional reshape.
k = rearrange(k, " o g n i -> o (g n i)").T q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
v = rearrange(v, " o g n i -> o (g n i)").T v = rearrange(v, " o g n i -> o (g n i)").T
self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q) self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q)
self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k) self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k)
@ -1496,6 +1633,96 @@ class InternLM2Model(Model):
self.post_write_tensors(tensor_map, name, data_torch) self.post_write_tensors(tensor_map, name, data_torch)
class BertModel(Model):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.block_count = self.hparams["num_hidden_layers"]
def set_gguf_parameters(self):
# TODO(cebtenzzre): merge with parent class
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
self.gguf_writer.add_causal_attention(False)
self.gguf_writer.add_file_type(self.ftype)
def set_vocab(self):
path = self.dir_model
added_tokens_path = self.dir_model if self.dir_model.exists() else None
# use huggingface vocab to get all tokens
vocab = HfVocab(path, added_tokens_path)
tokens, scores, toktypes = zip(*vocab.all_tokens())
assert len(tokens) == vocab.vocab_size
# we need this to validate the size of the token_type embeddings
# though currently we are passing all zeros to the token_type embeddings
n_token_types = len(set(toktypes))
self.gguf_writer.add_token_type_count(n_token_types)
# convert to phantom space vocab
def phantom(tok, typ):
if tok.startswith(b"[") and tok.endswith(b"]"):
return tok
if tok.startswith(b"##"):
return tok[2:]
return b"\xe2\x96\x81" + tok
tokens = [phantom(t, y) for t, y in zip(tokens, toktypes)]
# set up bos and eos tokens (cls and sep)
self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
# add vocab to gguf
self.gguf_writer.add_tokenizer_model("bert")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
# handle special tokens
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
def write_tensors(self):
tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
tensors = dict(self.get_tensors())
for name, data_torch in tensors.items():
# we are only using BERT for embeddings so we don't need the pooling layer
if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
continue # we don't need these
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
data = data_torch.squeeze().numpy()
n_dims = len(data.shape)
new_dtype: type[np.floating[Any]]
if (
self.ftype == 1 and name.endswith(".weight") and n_dims == 2
and name != "embeddings.token_type_embeddings.weight" # not used with get_rows, must be F32
):
# if f16 desired, convert any float32 2-dim weight tensors to float16
new_dtype = np.float16
else:
# if f32 desired, convert any float16 to float32
new_dtype = np.float32
print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")
if data.dtype != new_dtype:
data = data.astype(new_dtype)
self.gguf_writer.add_tensor(new_name, data)
###### CONVERSION LOGIC ###### ###### CONVERSION LOGIC ######

View File

@ -88,7 +88,8 @@ def main():
gguf_writer.add_embedding_length(hidden_size) gguf_writer.add_embedding_length(hidden_size)
gguf_writer.add_block_count(block_count) gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size) gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
gguf_writer.add_rope_dimension_count(hidden_size // head_count) # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
gguf_writer.add_head_count(head_count) gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv) gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_rope_freq_base(hparams.rotary_emb_base) gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)

View File

@ -334,9 +334,9 @@ class Params:
class BpeVocab: class BpeVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
try: if isinstance(self.bpe_tokenizer.get('model'), dict):
self.vocab = self.bpe_tokenizer["model"]["vocab"] self.vocab = self.bpe_tokenizer["model"]["vocab"]
except KeyError: else:
self.vocab = self.bpe_tokenizer self.vocab = self.bpe_tokenizer
added_tokens: dict[str, int] added_tokens: dict[str, int]
if fname_added_tokens is not None: if fname_added_tokens is not None:
@ -515,10 +515,14 @@ class HfVocab:
# Yield token text, score, and type # Yield token text, score, and type
yield token_text, self.get_token_score(token_id), self.get_token_type( yield token_text, self.get_token_score(token_id), self.get_token_type(
token_id, self.special_ids # Reuse already stored special IDs token_id, token_text, self.special_ids # Reuse already stored special IDs
) )
def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType: def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
# Special case for byte tokens
if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
return gguf.TokenType.BYTE
# Determine token type based on whether it's a special token # Determine token type based on whether it's a special token
return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
@ -530,7 +534,7 @@ class HfVocab:
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list: for text in self.added_tokens_list:
if text in self.specials: if text in self.specials:
toktype = self.get_token_type(self.specials[text], self.special_ids) toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
score = self.get_token_score(self.specials[text]) score = self.get_token_score(self.specials[text])
else: else:
toktype = gguf.TokenType.USER_DEFINED toktype = gguf.TokenType.USER_DEFINED

View File

@ -87,7 +87,17 @@ int main(int argc, char ** argv) {
} }
const int n_embd = llama_n_embd(model); const int n_embd = llama_n_embd(model);
const auto * embeddings = llama_get_embeddings(ctx); auto * embeddings = llama_get_embeddings(ctx);
// l2-normalize embeddings
float norm = 0;
for (int i = 0; i < n_embd; i++) {
norm += embeddings[i] * embeddings[i];
}
norm = sqrt(norm);
for (int i = 0; i < n_embd; i++) {
embeddings[i] /= norm;
}
for (int i = 0; i < n_embd; i++) { for (int i = 0; i < n_embd; i++) {
printf("%f ", embeddings[i]); printf("%f ", embeddings[i]);

View File

@ -337,24 +337,14 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
params.mem_buffer = NULL; params.mem_buffer = NULL;
params.no_alloc = true; params.no_alloc = true;
struct ggml_context * ctx = NULL; struct ggml_context * ctx = NULL;
struct ggml_allocr * alloc = NULL; struct ggml_gallocr * alloc = NULL;
struct ggml_cgraph * gf = NULL; struct ggml_cgraph * gf = NULL;
ctx = ggml_init(params); ctx = ggml_init(params);
alloc = ggml_allocr_new_measure(tensor_alignment); alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling); gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf);
ggml_allocr_free(alloc);
ggml_free(ctx);
static std::vector<uint8_t> data_compute; ggml_gallocr_alloc_graph(alloc, gf);
data_compute.resize(alloc_size + tensor_alignment);
ctx = ggml_init(params);
alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment);
gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
ggml_allocr_alloc_graph(alloc, gf);
ggml_allocr_free(alloc);
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads); struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
static std::vector<uint8_t> data_work; static std::vector<uint8_t> data_work;
@ -363,6 +353,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
ggml_graph_compute(gf, &cplan); ggml_graph_compute(gf, &cplan);
ggml_gallocr_free(alloc);
ggml_free(ctx); ggml_free(ctx);
return true; return true;
} }

View File

@ -1,5 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h"
#include "llama.h" #include "llama.h"
#include "common.h" #include "common.h"
#include "train.h" #include "train.h"
@ -13,8 +14,6 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
static const size_t tensor_alignment = 32;
struct my_llama_hparams { struct my_llama_hparams {
uint32_t n_vocab = 32000; uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; uint32_t n_ctx = 512;
@ -128,7 +127,7 @@ struct my_llama_lora_layer {
struct my_llama_lora { struct my_llama_lora {
struct ggml_context * ctx = NULL; struct ggml_context * ctx = NULL;
std::vector<uint8_t> data; ggml_backend_buffer_t data;
my_llama_lora_hparams hparams; my_llama_lora_hparams hparams;
@ -372,63 +371,6 @@ static void set_param_lora(struct my_llama_lora * lora) {
} }
} }
static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora) {
ggml_allocr_alloc(alloc, lora->tok_embeddings_a);
ggml_allocr_alloc(alloc, lora->tok_embeddings_b);
ggml_allocr_alloc(alloc, lora->norm_a);
ggml_allocr_alloc(alloc, lora->norm_b);
ggml_allocr_alloc(alloc, lora->output_a);
ggml_allocr_alloc(alloc, lora->output_b);
for (uint32_t i = 0; i < lora->layers.size(); ++i) {
auto & layer = lora->layers[i];
ggml_allocr_alloc(alloc, layer.attention_norm_a);
ggml_allocr_alloc(alloc, layer.attention_norm_b);
ggml_allocr_alloc(alloc, layer.wq_a);
ggml_allocr_alloc(alloc, layer.wq_b);
ggml_allocr_alloc(alloc, layer.wk_a);
ggml_allocr_alloc(alloc, layer.wk_b);
ggml_allocr_alloc(alloc, layer.wv_a);
ggml_allocr_alloc(alloc, layer.wv_b);
ggml_allocr_alloc(alloc, layer.wo_a);
ggml_allocr_alloc(alloc, layer.wo_b);
ggml_allocr_alloc(alloc, layer.ffn_norm_a);
ggml_allocr_alloc(alloc, layer.ffn_norm_b);
ggml_allocr_alloc(alloc, layer.w1_a);
ggml_allocr_alloc(alloc, layer.w1_b);
ggml_allocr_alloc(alloc, layer.w2_a);
ggml_allocr_alloc(alloc, layer.w2_b);
ggml_allocr_alloc(alloc, layer.w3_a);
ggml_allocr_alloc(alloc, layer.w3_b);
}
ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad);
ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad);
ggml_allocr_alloc(alloc, lora->norm_a->grad);
ggml_allocr_alloc(alloc, lora->norm_b->grad);
ggml_allocr_alloc(alloc, lora->output_a->grad);
ggml_allocr_alloc(alloc, lora->output_b->grad);
for (uint32_t i = 0; i < lora->layers.size(); ++i) {
auto & layer = lora->layers[i];
ggml_allocr_alloc(alloc, layer.attention_norm_a->grad);
ggml_allocr_alloc(alloc, layer.attention_norm_b->grad);
ggml_allocr_alloc(alloc, layer.wq_a->grad);
ggml_allocr_alloc(alloc, layer.wq_b->grad);
ggml_allocr_alloc(alloc, layer.wk_a->grad);
ggml_allocr_alloc(alloc, layer.wk_b->grad);
ggml_allocr_alloc(alloc, layer.wv_a->grad);
ggml_allocr_alloc(alloc, layer.wv_b->grad);
ggml_allocr_alloc(alloc, layer.wo_a->grad);
ggml_allocr_alloc(alloc, layer.wo_b->grad);
ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad);
ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad);
ggml_allocr_alloc(alloc, layer.w1_a->grad);
ggml_allocr_alloc(alloc, layer.w1_b->grad);
ggml_allocr_alloc(alloc, layer.w2_a->grad);
ggml_allocr_alloc(alloc, layer.w2_b->grad);
ggml_allocr_alloc(alloc, layer.w3_a->grad);
ggml_allocr_alloc(alloc, layer.w3_b->grad);
}
}
static void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) { static void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) {
const auto & lparams = lora->hparams; const auto & lparams = lora->hparams;
@ -522,18 +464,8 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
set_param_lora(lora); set_param_lora(lora);
// measure data size // allocate data for lora tensors
size_t size = 0; lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
}
// allocate data
struct ggml_allocr * alloc = NULL;
lora->data.resize(size + tensor_alignment);
alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
alloc_lora(alloc, lora);
ggml_allocr_free(alloc);
} }
static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) { static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) {
@ -579,7 +511,7 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
static struct ggml_tensor * llama_build_lora_finetune_graphs( static struct ggml_tensor * llama_build_lora_finetune_graphs(
struct my_llama_model * model, struct my_llama_model * model,
struct my_llama_lora * lora, struct my_llama_lora * lora,
struct ggml_allocr * alloc, ggml_gallocr_t alloc,
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_cgraph * gf, struct ggml_cgraph * gf,
struct ggml_cgraph * gb, struct ggml_cgraph * gb,
@ -590,7 +522,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
const int n_tokens, const int n_tokens,
const int n_batch, const int n_batch,
const bool enable_flash_attn, const bool enable_flash_attn,
const bool enable_checkpointing) { const bool enable_checkpointing,
const bool measure_only) {
ggml_set_scratch(ctx, { 0, 0, nullptr, }); ggml_set_scratch(ctx, { 0, 0, nullptr, });
const int n_past = 0; const int n_past = 0;
@ -622,13 +555,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
// KQ_pos - contains the positions // KQ_pos - contains the positions
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
ggml_allocr_alloc(alloc, KQ_pos); ggml_set_input(KQ_pos);
if (!ggml_allocr_is_measure(alloc)) {
int * data = (int *) KQ_pos->data;
for (int i = 0; i < N; ++i) {
data[i] = n_past + i;
}
}
// rope has so much parameters that we make a custom function for it // rope has so much parameters that we make a custom function for it
auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
@ -780,7 +707,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
// input gradient // input gradient
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
ggml_allocr_alloc(alloc, t36->grad); ggml_set_input(t36->grad);
// KQ_pos // KQ_pos
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
@ -805,11 +732,23 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
// note: they will be freed in reverse order // note: they will be freed in reverse order
for (unsigned int i = 0; i < checkpoints.size(); ++i) { for (unsigned int i = 0; i < checkpoints.size(); ++i) {
if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
ggml_allocr_alloc(alloc, checkpoints[i]); ggml_set_input(checkpoints[i]);
} }
} }
ggml_allocr_alloc_graph(alloc, gb); if (measure_only) {
ggml_gallocr_reserve(alloc, gb);
} else {
ggml_gallocr_alloc_graph(alloc, gb);
// set KQ_pos
{
int * data = (int *) KQ_pos->data;
for (int i = 0; i < N; ++i) {
data[i] = n_past + i;
}
}
}
// remove the additional nodes and leafs // remove the additional nodes and leafs
for (int i = n_leafs_before; i < gb->n_leafs; ++i) { for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
@ -1663,7 +1602,7 @@ int main(int argc, char ** argv) {
printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f)); printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f));
if (params.only_write_lora) { if (params.only_write_lora) {
save_train_files_data save_data; save_train_files_data save_data;
@ -1690,10 +1629,6 @@ int main(int argc, char ** argv) {
int n_vocab = model.hparams.n_vocab; int n_vocab = model.hparams.n_vocab;
int n_batch = params.common.n_batch; int n_batch = params.common.n_batch;
std::vector<uint8_t> mem_input_data;
std::vector<uint8_t> mem_compute_data;
// context for input tensors without their data // context for input tensors without their data
struct ggml_init_params ctx_input_params = { struct ggml_init_params ctx_input_params = {
ggml_tensor_overhead() * 2, // mem_size ggml_tensor_overhead() * 2, // mem_size
@ -1706,17 +1641,11 @@ int main(int argc, char ** argv) {
struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch); struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
// measure required memory for input tensors
size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
tensor_alignment;
printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
// allocate input tensors // allocate input tensors
mem_input_data.resize(max_input_size); // measure required memory for input tensors
ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
ggml_allocr_alloc(alloc_inps, tokens_input); size_t max_input_size = ggml_backend_buffer_get_size(input_data);
ggml_allocr_alloc(alloc_inps, target_probs); printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
// context for compute tensors without their data // context for compute tensors without their data
const size_t estimated_compute_size_wo_data = ( const size_t estimated_compute_size_wo_data = (
@ -1743,7 +1672,7 @@ int main(int argc, char ** argv) {
// find best evaluation order // find best evaluation order
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
ctx_compute = ggml_init(ctx_compute_params); ctx_compute = ggml_init(ctx_compute_params);
ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment); ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf->order = (enum ggml_cgraph_eval_order) order; gf->order = (enum ggml_cgraph_eval_order) order;
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1756,14 +1685,15 @@ int main(int argc, char ** argv) {
&logits, tokens_input, target_probs, &logits, tokens_input, target_probs,
n_tokens, n_batch, n_tokens, n_batch,
params.common.use_flash, params.common.use_flash,
params.common.use_checkpointing params.common.use_checkpointing,
true
); );
size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
if (max_compute_size < best_compute_size) { if (max_compute_size < best_compute_size) {
best_compute_size = max_compute_size; best_compute_size = max_compute_size;
best_order = gf->order; best_order = gf->order;
} }
ggml_allocr_free(alloc); ggml_gallocr_free(alloc);
ggml_free(ctx_compute); ggml_free(ctx_compute);
} }
size_t max_compute_size = best_compute_size; size_t max_compute_size = best_compute_size;
@ -1774,9 +1704,8 @@ int main(int argc, char ** argv) {
"invalid"); "invalid");
// allocate compute tensors // allocate compute tensors
mem_compute_data.resize(max_compute_size);
ctx_compute = ggml_init(ctx_compute_params); ctx_compute = ggml_init(ctx_compute_params);
ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf->order = best_order; gf->order = best_order;
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1789,11 +1718,9 @@ int main(int argc, char ** argv) {
&logits, tokens_input, target_probs, &logits, tokens_input, target_probs,
n_tokens, n_batch, n_tokens, n_batch,
params.common.use_flash, params.common.use_flash,
params.common.use_checkpointing params.common.use_checkpointing,
false
); );
ggml_allocr_free(alloc);
ggml_allocr_free(alloc_inps);
// tokenize data // tokenize data
std::vector<llama_token> train_tokens; std::vector<llama_token> train_tokens;
@ -1908,6 +1835,8 @@ int main(int argc, char ** argv) {
ggml_free(ctx_work); ggml_free(ctx_work);
ggml_free(ctx_compute); ggml_free(ctx_compute);
ggml_free(ctx_input); ggml_free(ctx_input);
ggml_gallocr_free(alloc);
int64_t t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
printf("%s: total training time: ", __func__); printf("%s: total training time: ", __func__);

View File

@ -36,6 +36,8 @@ public:
void set_parameters(StatParams&& params) { m_params = std::move(params); } void set_parameters(StatParams&& params) { m_params = std::move(params); }
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix() const; void save_imatrix() const;
bool load_imatrix(const char * file_name, bool add);
static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
private: private:
std::unordered_map<std::string, Stats> m_stats; std::unordered_map<std::string, Stats> m_stats;
StatParams m_params; StatParams m_params;
@ -189,6 +191,57 @@ void IMatrixCollector::save_imatrix(const char * fname) const {
} }
} }
bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) {
std::ifstream in(imatrix_file, std::ios::binary);
if (!in) {
printf("%s: failed to open %s\n",__func__,imatrix_file);
return false;
}
int n_entries;
in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, imatrix_file);
return false;
}
for (int i = 0; i < n_entries; ++i) {
int len; in.read((char *)&len, sizeof(len));
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
return false;
}
name_as_vec[len] = 0;
std::string name{name_as_vec.data()};
auto& e = imatrix_data[std::move(name)];
int ncall;
in.read((char*)&ncall, sizeof(ncall));
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n",__func__,i);
imatrix_data = {};
return false;
}
e.values.resize(nval);
in.read((char*)e.values.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i);
imatrix_data = {};
return false;
}
e.ncall = ncall;
}
return true;
}
bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
if (!add) {
m_stats.clear();
}
return load_imatrix(file_name, m_stats);
}
static IMatrixCollector g_collector; static IMatrixCollector g_collector;
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@ -269,7 +322,7 @@ static void process_logits(
} }
} }
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) { static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
@ -282,6 +335,15 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
auto tim2 = std::chrono::high_resolution_clock::now(); auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count()); fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (from_chunk > 0) {
if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
return false;
}
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
}
if (int(tokens.size()) < 2*n_ctx) { if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx, fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
n_ctx); n_ctx);
@ -402,7 +464,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
StatParams sparams; StatParams sparams;
std::string prev_result_file;
std::string combine_files;
bool compute_ppl = true; bool compute_ppl = true;
int from_chunk = 0;
std::vector<char*> args; std::vector<char*> args;
args.push_back(argv[0]); args.push_back(argv[0]);
int iarg = 1; int iarg = 1;
@ -423,6 +488,13 @@ int main(int argc, char ** argv) {
compute_ppl = false; compute_ppl = false;
} else if (arg == "--keep-imatrix") { } else if (arg == "--keep-imatrix") {
sparams.keep_every = std::stoi(argv[++iarg]); sparams.keep_every = std::stoi(argv[++iarg]);
} else if (arg == "--continue-from") {
prev_result_file = argv[++iarg];
} else if (arg == "--combine") {
combine_files = argv[++iarg];
}
else if (arg == "--from-chunk") {
from_chunk = std::stoi(argv[++iarg]);
} else { } else {
args.push_back(argv[iarg]); args.push_back(argv[iarg]);
} }
@ -436,14 +508,50 @@ int main(int argc, char ** argv) {
} }
} }
g_collector.set_parameters(std::move(sparams));
if (!combine_files.empty()) {
std::vector<std::string> files;
size_t pos = 0;
while (true) {
auto new_pos = combine_files.find(',', pos);
if (new_pos != std::string::npos) {
files.emplace_back(combine_files.substr(pos, new_pos - pos));
pos = new_pos + 1;
} else {
files.emplace_back(combine_files.substr(pos));
break;
}
}
if (files.size() < 2) {
fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
return 1;
}
printf("Combining the following %d files\n", int(files.size()));
for (auto& file : files) {
printf(" %s\n", file.c_str());
if (!g_collector.load_imatrix(file.c_str(), true)) {
fprintf(stderr, "Failed to load %s\n", file.c_str());
return 1;
}
}
g_collector.save_imatrix();
return 0;
}
if (!prev_result_file.empty()) {
if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
return 1;
}
}
gpt_params params; gpt_params params;
params.n_batch = 512; params.n_batch = 512;
if (!gpt_params_parse(args.size(), args.data(), params)) { if (!gpt_params_parse(args.size(), args.data(), params)) {
return 1; return 1;
} }
g_collector.set_parameters(std::move(sparams));
params.logits_all = true; params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx); params.n_batch = std::min(params.n_batch, params.n_ctx);
@ -495,7 +603,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", get_system_info(params).c_str());
} }
bool OK = compute_imatrix(ctx, params, compute_ppl); bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
if (!OK) { if (!OK) {
return 1; return 1;
} }

View File

@ -27,12 +27,16 @@ options:
-p, --n-prompt <n> (default: 512) -p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128) -n, --n-gen <n> (default: 128)
-b, --batch-size <n> (default: 512) -b, --batch-size <n> (default: 512)
--memory-f32 <0|1> (default: 0) -ctk <t>, --cache-type-k <t> (default: f16)
-t, --threads <n> (default: 16) -ctv <t>, --cache-type-v <t> (default: f16)
-ngl N, --n-gpu-layers <n> (default: 99) -t, --threads <n> (default: 112)
-mg i, --main-gpu <i> (default: 0) -ngl, --n-gpu-layers <n> (default: 99)
-sm, --split-mode <none|layer|row> (default: layer)
-mg, --main-gpu <i> (default: 0)
-nkvo, --no-kv-offload <0|1> (default: 0)
-mmp, --mmap <0|1> (default: 1)
-mmq, --mul-mat-q <0|1> (default: 1) -mmq, --mul-mat-q <0|1> (default: 1)
-ts, --tensor_split <ts0/ts1/..> -ts, --tensor_split <ts0/ts1/..> (default: 0)
-r, --repetitions <n> (default: 5) -r, --repetitions <n> (default: 5)
-o, --output <csv|json|md|sql> (default: md) -o, --output <csv|json|md|sql> (default: md)
-v, --verbose (default: 0) -v, --verbose (default: 0)
@ -51,6 +55,10 @@ Each test is repeated the number of times given by `-r`, and the results are ave
For a description of the other options, see the [main example](../main/README.md). For a description of the other options, see the [main example](../main/README.md).
Note:
- When using SYCL backend, there would be hang issue in some cases. Please set `--mmp 0`.
## Examples ## Examples
### Text generation with different models ### Text generation with different models

View File

@ -20,6 +20,7 @@
#include "llama.h" #include "llama.h"
#include "common.h" #include "common.h"
#include "ggml-cuda.h" #include "ggml-cuda.h"
#include "ggml-sycl.h"
// utils // utils
static uint64_t get_time_ns() { static uint64_t get_time_ns() {
@ -120,6 +121,22 @@ static std::string get_gpu_info() {
id += "/"; id += "/";
} }
} }
#endif
#ifdef GGML_USE_SYCL
int device_list[GGML_SYCL_MAX_DEVICES];
ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
if (device_list[i] >0 ){
char buf[128];
ggml_sycl_get_device_description(i, buf, sizeof(buf));
id += buf;
id += "/";
}
}
if (id.length() >2 ) {
id.pop_back();
}
#endif #endif
// TODO: other backends // TODO: other backends
return id; return id;
@ -161,6 +178,7 @@ struct cmd_params {
std::vector<bool> no_kv_offload; std::vector<bool> no_kv_offload;
std::vector<bool> mul_mat_q; std::vector<bool> mul_mat_q;
std::vector<std::vector<float>> tensor_split; std::vector<std::vector<float>> tensor_split;
std::vector<bool> use_mmap;
int reps; int reps;
bool verbose; bool verbose;
output_formats output_format; output_formats output_format;
@ -180,6 +198,7 @@ static const cmd_params cmd_params_defaults = {
/* no_kv_offload */ {false}, /* no_kv_offload */ {false},
/* mul_mat_q */ {true}, /* mul_mat_q */ {true},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)}, /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true},
/* reps */ 5, /* reps */ 5,
/* verbose */ false, /* verbose */ false,
/* output_format */ MARKDOWN /* output_format */ MARKDOWN
@ -201,6 +220,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n"); printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps); printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
@ -370,6 +390,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
} }
auto p = split<bool>(argv[i], split_delim); auto p = split<bool>(argv[i], split_delim);
params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end()); params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
} else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
} else if (arg == "-ts" || arg == "--tensor-split") { } else if (arg == "-ts" || arg == "--tensor-split") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -441,6 +468,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; } if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
return params; return params;
@ -460,6 +488,7 @@ struct cmd_params_instance {
bool no_kv_offload; bool no_kv_offload;
bool mul_mat_q; bool mul_mat_q;
std::vector<float> tensor_split; std::vector<float> tensor_split;
bool use_mmap;
llama_model_params to_llama_mparams() const { llama_model_params to_llama_mparams() const {
llama_model_params mparams = llama_model_default_params(); llama_model_params mparams = llama_model_default_params();
@ -468,6 +497,7 @@ struct cmd_params_instance {
mparams.split_mode = split_mode; mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu; mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data(); mparams.tensor_split = tensor_split.data();
mparams.use_mmap = use_mmap;
return mparams; return mparams;
} }
@ -477,6 +507,7 @@ struct cmd_params_instance {
n_gpu_layers == other.n_gpu_layers && n_gpu_layers == other.n_gpu_layers &&
split_mode == other.split_mode && split_mode == other.split_mode &&
main_gpu == other.main_gpu && main_gpu == other.main_gpu &&
use_mmap == other.use_mmap &&
tensor_split == other.tensor_split; tensor_split == other.tensor_split;
} }
@ -503,6 +534,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & sm : params.split_mode) for (const auto & sm : params.split_mode)
for (const auto & mg : params.main_gpu) for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split) for (const auto & ts : params.tensor_split)
for (const auto & mmp : params.use_mmap)
for (const auto & nb : params.n_batch) for (const auto & nb : params.n_batch)
for (const auto & tk : params.type_k) for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v) for (const auto & tv : params.type_v)
@ -527,6 +559,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .no_kv_offload= */ nkvo, /* .no_kv_offload= */ nkvo,
/* .mul_mat_q = */ mmq, /* .mul_mat_q = */ mmq,
/* .tensor_split = */ ts, /* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
}; };
instances.push_back(instance); instances.push_back(instance);
} }
@ -549,6 +582,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .no_kv_offload= */ nkvo, /* .no_kv_offload= */ nkvo,
/* .mul_mat_q = */ mmq, /* .mul_mat_q = */ mmq,
/* .tensor_split = */ ts, /* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
}; };
instances.push_back(instance); instances.push_back(instance);
} }
@ -565,6 +599,7 @@ struct test {
static const bool vulkan; static const bool vulkan;
static const bool kompute; static const bool kompute;
static const bool metal; static const bool metal;
static const bool sycl;
static const bool gpu_blas; static const bool gpu_blas;
static const bool blas; static const bool blas;
static const std::string cpu_info; static const std::string cpu_info;
@ -583,6 +618,7 @@ struct test {
bool no_kv_offload; bool no_kv_offload;
bool mul_mat_q; bool mul_mat_q;
std::vector<float> tensor_split; std::vector<float> tensor_split;
bool use_mmap;
int n_prompt; int n_prompt;
int n_gen; int n_gen;
std::string test_time; std::string test_time;
@ -605,6 +641,7 @@ struct test {
no_kv_offload = inst.no_kv_offload; no_kv_offload = inst.no_kv_offload;
mul_mat_q = inst.mul_mat_q; mul_mat_q = inst.mul_mat_q;
tensor_split = inst.tensor_split; tensor_split = inst.tensor_split;
use_mmap = inst.use_mmap;
n_prompt = inst.n_prompt; n_prompt = inst.n_prompt;
n_gen = inst.n_gen; n_gen = inst.n_gen;
// RFC 3339 date-time format // RFC 3339 date-time format
@ -654,25 +691,29 @@ struct test {
if (metal) { if (metal) {
return "Metal"; return "Metal";
} }
if (sycl) {
return GGML_SYCL_NAME;
}
if (gpu_blas) { if (gpu_blas) {
return "GPU BLAS"; return "GPU BLAS";
} }
if (blas) { if (blas) {
return "BLAS"; return "BLAS";
} }
return "CPU"; return "CPU";
} }
static const std::vector<std::string> & get_fields() { static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = { static const std::vector<std::string> fields = {
"build_commit", "build_number", "build_commit", "build_number",
"cuda", "opencl", "vulkan", "kompute", "metal", "gpu_blas", "blas", "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
"cpu_info", "gpu_info", "cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params", "model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_threads", "type_k", "type_v", "n_batch", "n_threads", "type_k", "type_v",
"n_gpu_layers", "split_mode", "n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "main_gpu", "no_kv_offload",
"mul_mat_q", "tensor_split", "mul_mat_q", "tensor_split", "use_mmap",
"n_prompt", "n_gen", "test_time", "n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns", "avg_ns", "stddev_ns",
"avg_ts", "stddev_ts" "avg_ts", "stddev_ts"
@ -691,8 +732,8 @@ struct test {
return INT; return INT;
} }
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" || if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "f16_kv" || field == "no_kv_offload" || field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "mul_mat_q") { field == "mul_mat_q" || field == "use_mmap") {
return BOOL; return BOOL;
} }
if (field == "avg_ts" || field == "stddev_ts") { if (field == "avg_ts" || field == "stddev_ts") {
@ -720,13 +761,13 @@ struct test {
std::vector<std::string> values = { std::vector<std::string> values = {
build_commit, std::to_string(build_number), build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan), std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info, cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode), std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(main_gpu), std::to_string(no_kv_offload),
std::to_string(mul_mat_q), tensor_split_str, std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()), std::to_string(avg_ns()), std::to_string(stdev_ns()),
std::to_string(avg_ts()), std::to_string(stdev_ts()) std::to_string(avg_ts()), std::to_string(stdev_ts())
@ -753,6 +794,7 @@ const bool test::kompute = !!ggml_cpu_has_kompute();
const bool test::metal = !!ggml_cpu_has_metal(); const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas(); const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas(); const bool test::blas = !!ggml_cpu_has_blas();
const bool test::sycl = !!ggml_cpu_has_sycl();
const std::string test::cpu_info = get_cpu_info(); const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info(); const std::string test::gpu_info = get_gpu_info();
@ -895,6 +937,9 @@ struct markdown_printer : public printer {
if (field == "no_kv_offload") { if (field == "no_kv_offload") {
return "nkvo"; return "nkvo";
} }
if (field == "use_mmap") {
return "mmap";
}
if (field == "tensor_split") { if (field == "tensor_split") {
return "ts"; return "ts";
} }
@ -903,43 +948,46 @@ struct markdown_printer : public printer {
void print_header(const cmd_params & params) override { void print_header(const cmd_params & params) override {
// select fields to print // select fields to print
fields.push_back("model"); fields.emplace_back("model");
fields.push_back("size"); fields.emplace_back("size");
fields.push_back("params"); fields.emplace_back("params");
fields.push_back("backend"); fields.emplace_back("backend");
bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS"; bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
if (!is_cpu_backend) { if (!is_cpu_backend) {
fields.push_back("n_gpu_layers"); fields.emplace_back("n_gpu_layers");
} }
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
fields.push_back("n_threads"); fields.emplace_back("n_threads");
} }
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
fields.push_back("n_batch"); fields.emplace_back("n_batch");
} }
if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) { if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
fields.push_back("type_k"); fields.emplace_back("type_k");
} }
if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) { if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
fields.push_back("type_v"); fields.emplace_back("type_v");
} }
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
fields.push_back("main_gpu"); fields.emplace_back("main_gpu");
} }
if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) { if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
fields.push_back("split_mode"); fields.emplace_back("split_mode");
} }
if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) { if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
fields.push_back("mul_mat_q"); fields.emplace_back("mul_mat_q");
} }
if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) { if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
fields.push_back("no_kv_offload"); fields.emplace_back("no_kv_offload");
} }
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) { if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
fields.push_back("tensor_split"); fields.emplace_back("tensor_split");
} }
fields.push_back("test"); if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
fields.push_back("t/s"); fields.emplace_back("use_mmap");
}
fields.emplace_back("test");
fields.emplace_back("t/s");
fprintf(fout, "|"); fprintf(fout, "|");
for (const auto & field : fields) { for (const auto & field : fields) {

View File

@ -14,14 +14,14 @@ Build with cmake or run `make llava-cli` to build it.
After building, run: `./llava-cli` to see the usage. For example: After building, run: `./llava-cli` to see the usage. For example:
```sh ```sh
./llava-cli -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg ./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
``` ```
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
## Model conversion ## Model conversion
- Clone `llava-v15-7b`` and `clip-vit-large-patch14-336`` locally: - Clone `llava-v15-7b` and `clip-vit-large-patch14-336` locally:
```sh ```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
@ -29,19 +29,25 @@ git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
git clone https://huggingface.co/openai/clip-vit-large-patch14-336 git clone https://huggingface.co/openai/clip-vit-large-patch14-336
``` ```
2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: 2. Install the required Python packages:
```sh
pip install -r examples/llava/requirements.txt
```
3. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
```sh ```sh
python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
``` ```
3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF: 4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
```sh ```sh
python ./examples/llava/convert-image-encoder-to-gguf -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
``` ```
4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF: 5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
```sh ```sh
python ./convert.py ../llava-v1.5-7b python ./convert.py ../llava-v1.5-7b

View File

@ -367,7 +367,7 @@ struct clip_ctx {
ggml_backend_buffer_t params_buffer = NULL; ggml_backend_buffer_t params_buffer = NULL;
ggml_backend_buffer_t compute_buffer = NULL; ggml_backend_buffer_t compute_buffer = NULL;
ggml_backend_t backend = NULL; ggml_backend_t backend = NULL;
ggml_allocr * compute_alloc = NULL; ggml_gallocr_t compute_alloc = NULL;
}; };
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) { static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
@ -405,31 +405,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
ggml_allocr_alloc(ctx->compute_alloc, inp_raw); ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
float * data = (float *)malloc(ggml_nbytes(inp_raw));
for (size_t i = 0; i < imgs->size; i++) {
const int nx = imgs->data[i].nx;
const int ny = imgs->data[i].ny;
GGML_ASSERT(nx == image_size && ny == image_size);
const int n = nx * ny;
for (int b = 0; b < batch_size; b++) {
for (int k = 0; k < 3; k++) {
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
}
}
}
}
}
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
free(data);
}
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
@ -438,13 +415,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// concat class_embeddings and patch_embeddings // concat class_embeddings and patch_embeddings
struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
ggml_allocr_alloc(ctx->compute_alloc, embeddings); ggml_set_name(embeddings, "embeddings");
if (!ggml_allocr_is_measure(ctx->compute_alloc)) { ggml_set_input(embeddings);
void* zero_mem = malloc(ggml_nbytes(embeddings));
memset(zero_mem, 0, ggml_nbytes(embeddings));
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
free(zero_mem);
}
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
@ -453,15 +425,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_allocr_alloc(ctx->compute_alloc, positions); ggml_set_name(positions, "positions");
if (!ggml_allocr_is_measure(ctx->compute_alloc)) { ggml_set_input(positions);
int* positions_data = (int*)malloc(ggml_nbytes(positions));
for (int i = 0; i < num_positions; i++) {
positions_data[i] = i;
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
embeddings = embeddings =
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
@ -560,15 +525,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
ggml_allocr_alloc(ctx->compute_alloc, patches); ggml_set_name(patches, "patches");
if (!ggml_allocr_is_measure(ctx->compute_alloc)) { ggml_set_input(patches);
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
patches_data[i] = i + 1;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);
}
// shape [1, 576, 1024] // shape [1, 576, 1024]
// ne is whcn, ne = [1024, 576, 1, 1] // ne is whcn, ne = [1024, 576, 1, 1]
@ -809,7 +767,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
// data // data
size_t buffer_size = 0; size_t model_size = 0;
{ {
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i); const char * name = gguf_get_tensor_name(ctx, i);
@ -817,7 +775,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
enum ggml_type type = gguf_get_tensor_type(ctx, i); enum ggml_type type = gguf_get_tensor_type(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(meta, name); struct ggml_tensor * cur = ggml_get_tensor(meta, name);
size_t tensor_size = ggml_nbytes(cur); size_t tensor_size = ggml_nbytes(cur);
buffer_size += tensor_size; model_size += tensor_size;
if (verbosity >= 3) { if (verbosity >= 3) {
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
@ -825,8 +783,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
} }
buffer_size += n_tensors * 128 /* CLIP PADDING */;
clip_ctx * new_clip = new clip_ctx; clip_ctx * new_clip = new clip_ctx;
// update projector type // update projector type
@ -886,12 +842,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
printf("%s: model size: %.2f MB\n", __func__, buffer_size / 1024.0 / 1024.0); printf("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
} }
} }
printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, buffer_size / (1024.0 * 1024.0), n_tensors); printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
// load tensors // load tensors
{ {
@ -925,12 +881,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
// alloc memory and offload data // alloc memory and offload data
new_clip->params_buffer = ggml_backend_alloc_buffer(new_clip->backend, buffer_size); new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer);
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i); const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name); struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
ggml_allocr_alloc(alloc, cur);
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
fin.seekg(offset, std::ios::beg); fin.seekg(offset, std::ios::beg);
if (!fin) { if (!fin) {
@ -949,7 +903,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
} }
} }
ggml_allocr_free(alloc);
fin.close(); fin.close();
} }
@ -1077,15 +1030,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
// measure mem requirement and allocate // measure mem requirement and allocate
{ {
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend); new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
clip_image_f32_batch batch; clip_image_f32_batch batch;
batch.size = 1; batch.size = 1;
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch); ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(new_clip->compute_alloc, gf); ggml_gallocr_reserve(new_clip->compute_alloc, gf);
ggml_allocr_free(new_clip->compute_alloc); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
new_clip->compute_buffer = ggml_backend_alloc_buffer(new_clip->backend, compute_memory_buffer_size);
new_clip->compute_alloc = ggml_allocr_new_from_buffer(new_clip->compute_buffer);
printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
} }
@ -1267,12 +1217,72 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
GGML_ASSERT(batch_size == 1); // TODO: support multiple images GGML_ASSERT(batch_size == 1); // TODO: support multiple images
} }
// reset alloc buffer to clean the memory from previous invocations
ggml_allocr_reset(ctx->compute_alloc);
// build the inference graph // build the inference graph
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
ggml_allocr_alloc_graph(ctx->compute_alloc, gf); ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
// set inputs
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
const int image_size = hparams.image_size;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
const int num_positions = num_patches + 1;
{
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
float * data = (float *)malloc(ggml_nbytes(inp_raw));
for (size_t i = 0; i < imgs->size; i++) {
const int nx = imgs->data[i].nx;
const int ny = imgs->data[i].ny;
GGML_ASSERT(nx == image_size && ny == image_size);
const int n = nx * ny;
for (int b = 0; b < batch_size; b++) {
for (int k = 0; k < 3; k++) {
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
}
}
}
}
}
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
free(data);
}
{
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
void* zero_mem = malloc(ggml_nbytes(embeddings));
memset(zero_mem, 0, ggml_nbytes(embeddings));
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
free(zero_mem);
}
{
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions));
for (int i = 0; i < num_positions; i++) {
positions_data[i] = i;
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
{
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
patches_data[i] = i + 1;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);
}
if (ggml_backend_is_cpu(ctx->backend)) { if (ggml_backend_is_cpu(ctx->backend)) {
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);

View File

@ -71,7 +71,7 @@ def bytes_to_unicode():
return dict(zip(bs, cs)) return dict(zip(bs, cs))
ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py") ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument("--text-only", action="store_true", required=False, ap.add_argument("--text-only", action="store_true", required=False,

View File

@ -34,7 +34,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str; std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos); std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
eval_tokens(ctx_llama, embd_inp, n_batch, n_past); eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
return true; return true;
} }
@ -152,20 +152,8 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
size_t image_pos = prompt.find("<image>"); size_t image_pos = prompt.find("<image>");
if (image_pos != std::string::npos) { if (image_pos != std::string::npos) {
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos); system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length()); user_prompt = prompt.substr(image_pos + std::string("<image>").length());
// We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
size_t pos = 0;
while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
user_prompt.replace(pos, 2, "\n");
pos += 1; // Advance past the replaced newline
}
while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
system_prompt.replace(pos, 2, "\n");
pos += 1; // Advance past the replaced newline
}
printf("system_prompt: %s\n", system_prompt.c_str()); printf("system_prompt: %s\n", system_prompt.c_str());
printf("user_prompt: %s\n", user_prompt.c_str()); printf("user_prompt: %s\n", user_prompt.c_str());
} else { } else {

View File

@ -42,5 +42,5 @@ if len(clip_tensors) > 0:
torch.save(checkpoint, path) torch.save(checkpoint, path)
print("Done!") print("Done!")
print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")

View File

@ -0,0 +1,3 @@
-r ../../requirements/requirements-convert.txt
pillow~=10.2.0
torch~=2.1.1

View File

@ -1,7 +1,9 @@
#include "common.h" #include "common.h"
#include "ggml.h"
#include "llama.h" #include "llama.h"
#include <cmath> #include <cmath>
#include <cstdint>
#include <cstdio> #include <cstdio>
#include <string> #include <string>
#include <vector> #include <vector>
@ -73,6 +75,8 @@ int main(int argc, char ** argv){
int n_drafted = 0; int n_drafted = 0;
int n_accept = 0; int n_accept = 0;
int64_t t_draft_us = 0;
int n_past = inp.size(); int n_past = inp.size();
bool has_eos = false; bool has_eos = false;
@ -160,7 +164,7 @@ int main(int argc, char ** argv){
// generate n_pred tokens through prompt lookup // generate n_pred tokens through prompt lookup
auto prompt_lookup = [&]() -> void { auto prompt_lookup = [&]() -> void {
int inp_size = inp.size(); const int inp_size = inp.size();
for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){ for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
const llama_token * ngram = &inp[inp_size - ngram_size]; const llama_token * ngram = &inp[inp_size - ngram_size];
@ -191,8 +195,12 @@ int main(int argc, char ** argv){
return; return;
}; };
const int64_t t_start_draft_us = ggml_time_us();
prompt_lookup(); prompt_lookup();
t_draft_us += ggml_time_us() - t_start_draft_us;
llama_decode(ctx, batch_tgt); llama_decode(ctx, batch_tgt);
++n_past; ++n_past;
@ -210,6 +218,8 @@ int main(int argc, char ** argv){
LOG_TEE("n_draft = %d\n", n_draft); LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict); LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted); LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_TEE("n_accept = %d\n", n_accept); LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);

View File

@ -98,7 +98,7 @@ static void write_logfile(
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void sigint_handler(int signo) { static void sigint_handler(int signo) {
if (signo == SIGINT) { if (signo == SIGINT) {
if (!is_interacting) { if (!is_interacting && g_params->interactive) {
is_interacting = true; is_interacting = true;
} else { } else {
console::cleanup(); console::cleanup();
@ -352,12 +352,12 @@ int main(int argc, char ** argv) {
// in instruct mode, we inject a prefix and a suffix to each input by the user // in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) { if (params.instruct) {
params.interactive_first = true; params.interactive_first = true;
params.antiprompt.push_back("### Instruction:\n\n"); params.antiprompt.emplace_back("### Instruction:\n\n");
} }
// similar for chatml mode // similar for chatml mode
else if (params.chatml) { else if (params.chatml) {
params.interactive_first = true; params.interactive_first = true;
params.antiprompt.push_back("<|im_start|>user\n"); params.antiprompt.emplace_back("<|im_start|>user\n");
} }
// enable interactive mode if interactive start is specified // enable interactive mode if interactive start is specified
@ -392,7 +392,8 @@ int main(int argc, char ** argv) {
LOG_TEE("\n"); LOG_TEE("\n");
} }
if (params.interactive) { // ctrl+C handling
{
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action; struct sigaction sigint_action;
sigint_action.sa_handler = sigint_handler; sigint_action.sa_handler = sigint_handler;
@ -405,7 +406,9 @@ int main(int argc, char ** argv) {
}; };
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true); SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif #endif
}
if (params.interactive) {
LOG_TEE("%s: interactive mode on.\n", __func__); LOG_TEE("%s: interactive mode on.\n", __func__);
if (!params.antiprompt.empty()) { if (!params.antiprompt.empty()) {

View File

@ -457,14 +457,14 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
std::ofstream logits_stream; std::ofstream logits_stream;
if (!params.logits_file.empty()) { if (!params.logits_file.empty()) {
logits_stream.open(params.logits_file.c_str()); logits_stream.open(params.logits_file.c_str(), std::ios::binary);
if (!logits_stream.is_open()) { if (!logits_stream.is_open()) {
fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str()); fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
return {}; return {};
} }
fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str()); fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
logits_stream.write("_logits_", 8); logits_stream.write("_logits_", 8);
logits_stream.write((const char *)&n_ctx, sizeof(n_ctx)); logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
} }
auto tim1 = std::chrono::high_resolution_clock::now(); auto tim1 = std::chrono::high_resolution_clock::now();
@ -881,7 +881,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
size_t li = hs_cur.common_prefix; size_t li = hs_cur.common_prefix;
for (int s = 0; s < 4; ++s) { for (int s = 0; s < 4; ++s) {
for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) { for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
eval_pairs.push_back(std::make_pair(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1])); eval_pairs.emplace_back(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]);
} }
++li; ++li;
} }
@ -1159,13 +1159,13 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0; const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
size_t li = n_base1 - 1; size_t li = n_base1 - 1;
for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) { for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[0][j+1])); eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[0][j+1]);
} }
const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix; const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0; const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1; li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) { for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[1][j+1])); eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[1][j+1]);
} }
} }
compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results); compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
@ -1524,7 +1524,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
size_t li = cur_task.common_prefix; size_t li = cur_task.common_prefix;
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) { for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) { for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
eval_pairs.push_back(std::make_pair(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1])); eval_pairs.emplace_back(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]);
} }
++li; ++li;
} }

View File

@ -257,13 +257,13 @@ int main(int argc, char ** argv) {
invalid_param = true; invalid_param = true;
break; break;
} }
params.include_layers.push_back(argv[i]); params.include_layers.emplace_back(argv[i]);
} else if (arg == "-L" || arg == "--exclude-layer") { } else if (arg == "-L" || arg == "--exclude-layer") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
params.exclude_layers.push_back(argv[i]); params.exclude_layers.emplace_back(argv[i]);
} else if (arg == "-t" || arg == "--type") { } else if (arg == "-t" || arg == "--type") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;

View File

@ -208,13 +208,13 @@ int main(int argc, char ** argv) {
} }
} else if (strcmp(argv[arg_idx], "--include-weights") == 0) { } else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
if (arg_idx < argc-1) { if (arg_idx < argc-1) {
included_weights.push_back(argv[++arg_idx]); included_weights.emplace_back(argv[++arg_idx]);
} else { } else {
usage(argv[0]); usage(argv[0]);
} }
} else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) { } else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
if (arg_idx < argc-1) { if (arg_idx < argc-1) {
excluded_weights.push_back(argv[++arg_idx]); excluded_weights.emplace_back(argv[++arg_idx]);
} else { } else {
usage(argv[0]); usage(argv[0]);
} }

View File

@ -137,6 +137,10 @@ node index.js
`temperature`: Adjust the randomness of the generated text (default: 0.8). `temperature`: Adjust the randomness of the generated text (default: 0.8).
`dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled).
`dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).
`top_k`: Limit the next token selection to the K most probable tokens (default: 40). `top_k`: Limit the next token selection to the K most probable tokens (default: 40).
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95). `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
@ -181,7 +185,7 @@ node index.js
`ignore_eos`: Ignore end of stream token and continue generating (default: false). `ignore_eos`: Ignore end of stream token and continue generating (default: false).
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []). `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. (default: []).
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0) `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
@ -264,7 +268,23 @@ Notice that each `probs` is an array of length `n_probs`.
It also accepts all the options of `/completion` except `stream` and `prompt`. It also accepts all the options of `/completion` except `stream` and `prompt`.
- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots. - **GET** `/props`: Return current server settings.
### Result JSON
```json
{
"assistant_name": "",
"user_name": "",
"default_generation_settings": { ... },
"total_slots": 1
}
```
- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots.
- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served. - **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served.

View File

@ -236,214 +236,250 @@ unsigned char completion_js[] = {
0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28,
0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72,
0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c,
0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74,
0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x24, 0x65, 0x6e, 0x74, 0x2e, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x73,
0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61,
0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x7d, 0x60, 0x29, 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x29, 0x20, 0x7b, 0x0a,
0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x61,
0x6e, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x74, 0x6f, 0x20, 0x62,
0x65, 0x20, 0x63, 0x61, 0x75, 0x67, 0x68, 0x74, 0x20, 0x62, 0x79, 0x20,
0x75, 0x70, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x20, 0x63, 0x61, 0x6c,
0x6c, 0x65, 0x72, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77,
0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x27,
0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61, 0x69, 0x6c,
0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c,
0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c,
0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f,
0x72, 0x3a, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e,
0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
0x74, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65,
0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20,
0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72,
0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61,
0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65,
0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c,
0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f,
0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x72, 0x3a, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e,
0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x74, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f,
0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20,
0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74,
0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x79, 0x6f, 0x75, 0x20, 0x63, 0x61,
0x6e, 0x20, 0x73, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, 0x20,
0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61,
0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20,
0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20,
0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20,
0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x6e,
0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e,
0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x70, 0x72, 0x6f, 0x6d,
0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c,
0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, 0x22, 0x6d, 0x65, 0x73,
0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e,
0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74,
0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b,
0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x2e, 0x63, 0x6f, 0x6e, 0x74,
0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d,
0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20,
0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45,
0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d,
0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61,
0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63,
0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20,
0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x76, 0x65, 0x6e, 0x74,
0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20,
0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e,
0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63,
0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61,
0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68,
0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72,
0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29,
0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66,
0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61,
0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63,
0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f,
0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45,
0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73,
0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x6d, 0x65,
0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65,
0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e,
0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61,
0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, 0x6d, 0x65,
0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x45,
0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65,
0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x20,
0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65, 0x29,
0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x7d,
0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x7b,
0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, 0x3b,
0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a,
0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, 0x6c,
0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, 0x74, 0x61,
0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x79, 0x6f,
0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x74, 0x69, 0x6d, 0x69, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x73, 0x63, 0x72,
0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f,
0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f,
0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72,
0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65,
0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20, 0x66,
0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a,
0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f, 0x6e, 0x65, 0x22, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x7b, 0x20, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28,
0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x7d, 0x20, 0x7d, 0x29, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76,
0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28,
0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x28,
0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x20, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, 0x74, 0x68, 0x61, 0x74,
0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x73, 0x20, 0x74, 0x6f,
0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74,
0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, 0x20, 0x54, 0x68, 0x69,
0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x73,
0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x73, 0x74, 0x72, 0x65, 0x61,
0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45,
0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f,
0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50,
0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70,
0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x28, 0x63, 0x6f, 0x6e,
0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x2f,
0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63,
0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x2e,
0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20,
0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70,
0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c,
0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b,
0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20,
0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x63,
0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45,
0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x29,
0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28,
0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c,
0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d,
0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72,
0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73,
0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x72, 0x65, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c,
0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c,
0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e,
0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e,
0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74,
0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20,
0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74,
0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61,
0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77,
0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74,
0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68,
0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x28, 0x63, 0x6f, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29, 0x29,
0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x72, 0x72, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e,
0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72,
0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72,
0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d,
0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x2f, 0x2a, 0x2a, 0x0a, 0x20,
0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72, 0x65, 0x63, 0x61, 0x74, 0x65,
0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72,
0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x20, 0x3d, 0x20,
0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d,
0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65,
0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x29,
0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20,
0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74,
0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c,
0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x70,
0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d,
0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c,
0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x28, 0x63,
0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d,
0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65, 0x74, 0x20, 0x74, 0x68, 0x65,
0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, 0x69, 0x6e, 0x66, 0x6f, 0x20,
0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x65, 0x72,
0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73,
0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x20,
0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x74, 0x68, 0x65, 0x20,
0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20, 0x77, 0x69, 0x6e, 0x64,
0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, 0x6f, 0x20, 0x6f, 0x6e,
0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e,
0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x4d, 0x6f, 0x64, 0x65,
0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e,
0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74,
0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73,
0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65,
0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f, 0x6d, 0x6f, 0x64, 0x65,
0x6c, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x22, 0x29, 0x2e, 0x74, 0x68, 0x65,
0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72, 0x2e, 0x6a, 0x73, 0x6f,
0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72,
0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76,
0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74,
0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65, 0x6e,
0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74,
0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64,
0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69,
0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20,
0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63,
0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69,
0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63,
0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43,
0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22,
0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20,
0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e,
0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e,
0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76,
0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74,
0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f, 0x6e,
0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c,
0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x28,
0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b,
0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20,
0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72,
0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20,
0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65,
0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d,
0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e,
0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e,
0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x73,
0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f, 0x0a,
0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a,
0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c,
0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70,
0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28,
0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e,
0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69,
0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a,
0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f,
0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a, 0x2f,
0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d,
0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d,
0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64,
0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74,
0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f,
0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e,
0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d,
0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70,
0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20,
0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d,
0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x50,
0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63,
0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20, 0x72,
0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74,
0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b,
0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72,
0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e,
0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65,
0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76,
0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20,
0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x65,
0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x2f,
0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72, 0x65,
0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a, 0x65,
0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74,
0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x70,
0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72,
0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62,
0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63,
0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f,
0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72, 0x61,
0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70,
0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e,
0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29, 0x20,
0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61,
0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a, 0x20,
0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65, 0x74,
0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, 0x69,
0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65,
0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, 0x69,
0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, 0x20,
0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20,
0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20,
0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73,
0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74,
0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20,
0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20,
0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, 0x6e,
0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x20,
0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63,
0x68, 0x28, 0x22, 0x2f, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x22, 0x29, 0x2e,
0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72, 0x2e,
0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x70,
0x72, 0x6f, 0x70, 0x73, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74,
0x5f, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20,
0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67,
0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
}; };
unsigned int completion_js_len = 5346; unsigned int completion_js_len = 5782;

View File

@ -15,9 +15,13 @@
using json = nlohmann::json; using json = nlohmann::json;
inline static json oaicompat_completion_params_parse( inline static json oaicompat_completion_params_parse(
const json &body /* openai api json semantics */) const json &body, /* openai api json semantics */
const std::string &chat_template)
{ {
json llama_params; json llama_params;
std::string formatted_prompt = chat_template == "chatml"
? format_chatml(body["messages"]) // OpenAI 'messages' to chatml (with <|im_start|>,...)
: format_llama2(body["messages"]); // OpenAI 'messages' to llama2 (with [INST],...)
llama_params["__oaicompat"] = true; llama_params["__oaicompat"] = true;
@ -30,7 +34,7 @@ inline static json oaicompat_completion_params_parse(
// https://platform.openai.com/docs/api-reference/chat/create // https://platform.openai.com/docs/api-reference/chat/create
llama_sampling_params default_sparams; llama_sampling_params default_sparams;
llama_params["model"] = json_value(body, "model", std::string("unknown")); llama_params["model"] = json_value(body, "model", std::string("unknown"));
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' llama_params["prompt"] = formatted_prompt;
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
llama_params["temperature"] = json_value(body, "temperature", 0.0); llama_params["temperature"] = json_value(body, "temperature", 0.0);
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k); llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);

View File

@ -195,7 +195,8 @@ export const llamaComplete = async (params, controller, callback) => {
// Get the model info from the server. This is useful for getting the context window and so on. // Get the model info from the server. This is useful for getting the context window and so on.
export const llamaModelInfo = async () => { export const llamaModelInfo = async () => {
if (!generation_settings) { if (!generation_settings) {
generation_settings = await fetch("/model.json").then(r => r.json()); const props = await fetch("/props").then(r => r.json());
generation_settings = props.default_generation_settings;
} }
return generation_settings; return generation_settings;
} }

View File

@ -36,6 +36,7 @@ struct server_params
std::string hostname = "127.0.0.1"; std::string hostname = "127.0.0.1";
std::vector<std::string> api_keys; std::vector<std::string> api_keys;
std::string public_path = "examples/server/public"; std::string public_path = "examples/server/public";
std::string chat_template = "chatml";
int32_t port = 8080; int32_t port = 8080;
int32_t read_timeout = 600; int32_t read_timeout = 600;
int32_t write_timeout = 600; int32_t write_timeout = 600;
@ -334,6 +335,7 @@ struct llama_server_context
// slots / clients // slots / clients
std::vector<llama_client_slot> slots; std::vector<llama_client_slot> slots;
json default_generation_settings_for_props;
llama_server_queue queue_tasks; llama_server_queue queue_tasks;
llama_server_response queue_results; llama_server_response queue_results;
@ -430,6 +432,9 @@ struct llama_server_context
slots.push_back(slot); slots.push_back(slot);
} }
default_generation_settings_for_props = get_formated_generation(slots.front());
default_generation_settings_for_props["seed"] = -1;
batch = llama_batch_init(n_ctx, 0, params.n_parallel); batch = llama_batch_init(n_ctx, 0, params.n_parallel);
// empty system prompt // empty system prompt
@ -529,6 +534,8 @@ struct llama_server_context
slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
@ -619,18 +626,36 @@ struct llama_server_context
const int n_vocab = llama_n_vocab(model); const int n_vocab = llama_n_vocab(model);
for (const auto &el : *logit_bias) for (const auto &el : *logit_bias)
{ {
if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) if (el.is_array() && el.size() == 2)
{
float bias;
if (el[1].is_number())
{
bias = el[1].get<float>();
}
else if (el[1].is_boolean() && !el[1].get<bool>())
{
bias = -INFINITY;
}
else
{
continue;
}
if (el[0].is_number_integer())
{ {
llama_token tok = el[0].get<llama_token>(); llama_token tok = el[0].get<llama_token>();
if (tok >= 0 && tok < n_vocab) if (tok >= 0 && tok < n_vocab)
{ {
if (el[1].is_number()) slot->sparams.logit_bias[tok] = bias;
{
slot->sparams.logit_bias[tok] = el[1].get<float>();
} }
else if (el[1].is_boolean() && !el[1].get<bool>()) }
else if (el[0].is_string())
{ {
slot->sparams.logit_bias[tok] = -INFINITY; auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
for (auto tok : toks)
{
slot->sparams.logit_bias[tok] = bias;
} }
} }
} }
@ -983,11 +1008,6 @@ struct llama_server_context
queue_results.send(res); queue_results.send(res);
} }
json get_model_props()
{
return get_formated_generation(slots[0]);
}
json get_formated_generation(llama_client_slot &slot) json get_formated_generation(llama_client_slot &slot)
{ {
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
@ -998,6 +1018,8 @@ struct llama_server_context
{"model", params.model_alias}, {"model", params.model_alias},
{"seed", slot.params.seed}, {"seed", slot.params.seed},
{"temperature", slot.sparams.temp}, {"temperature", slot.sparams.temp},
{"dynatemp_range", slot.sparams.dynatemp_range},
{"dynatemp_exponent", slot.sparams.dynatemp_exponent},
{"top_k", slot.sparams.top_k}, {"top_k", slot.sparams.top_k},
{"top_p", slot.sparams.top_p}, {"top_p", slot.sparams.top_p},
{"min_p", slot.sparams.min_p}, {"min_p", slot.sparams.min_p},
@ -1159,13 +1181,30 @@ struct llama_server_context
task.multitask_id = multitask_id; task.multitask_id = multitask_id;
// when a completion task's prompt array is not a singleton, we split it into multiple requests // when a completion task's prompt array is not a singleton, we split it into multiple requests
if (task.data.count("prompt") && task.data.at("prompt").size() > 1) // otherwise, it's a single-prompt task, we actually queue it
{ // if there's numbers in the prompt array it will be treated as an array of tokens
split_multiprompt_task(task_id, task); if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) {
bool numbers = false;
for (const auto& e : task.data.at("prompt")) {
if (e.is_number()) {
numbers = true;
break;
}
} }
// otherwise, it's a single-prompt task, we actually queue it // NOTE: split_multiprompt_task() does not handle a mix of strings and numbers,
// it will completely stall the server. I don't know where the bug for this is.
//
// if there are numbers, it needs to be treated like a single prompt,
// queue_tasks handles a mix of strings and numbers just fine.
if (numbers) {
queue_tasks.post(task); queue_tasks.post(task);
} else {
split_multiprompt_task(task_id, task);
}
} else {
queue_tasks.post(task);
}
} }
// for multiple images processing // for multiple images processing
@ -1247,7 +1286,10 @@ struct llama_server_context
void split_multiprompt_task(int multitask_id, task_server& multiprompt_task) void split_multiprompt_task(int multitask_id, task_server& multiprompt_task)
{ {
int prompt_count = multiprompt_task.data.at("prompt").size(); int prompt_count = multiprompt_task.data.at("prompt").size();
assert(prompt_count > 1); if (prompt_count <= 1) {
send_error(multiprompt_task, "error while handling multiple prompts");
return;
}
// generate all the ID for subtask // generate all the ID for subtask
std::vector<int> subtask_ids(prompt_count); std::vector<int> subtask_ids(prompt_count);
@ -1569,10 +1611,6 @@ struct llama_server_context
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
} }
LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
slot.cache_tokens = prompt_tokens; slot.cache_tokens = prompt_tokens;
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
@ -1586,6 +1624,10 @@ struct llama_server_context
} }
} }
LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
LOG_VERBOSE("prompt ingested", { LOG_VERBOSE("prompt ingested", {
{"n_past", slot.n_past}, {"n_past", slot.n_past},
{"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)}, {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
@ -1836,6 +1878,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`"); printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`"); printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
printf(" --chat-template FORMAT_NAME");
printf(" set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
printf("\n"); printf("\n");
} }
@ -1884,7 +1928,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true; invalid_param = true;
break; break;
} }
sparams.api_keys.push_back(argv[i]); sparams.api_keys.emplace_back(argv[i]);
} }
else if (arg == "--api-key-file") else if (arg == "--api-key-file")
{ {
@ -2160,7 +2204,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true; invalid_param = true;
break; break;
} }
params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f)); params.lora_adapter.emplace_back(argv[i], 1.0f);
params.use_mmap = false; params.use_mmap = false;
} }
else if (arg == "--lora-scaled") else if (arg == "--lora-scaled")
@ -2176,7 +2220,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true; invalid_param = true;
break; break;
} }
params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i]))); params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
params.use_mmap = false; params.use_mmap = false;
} }
else if (arg == "--lora-base") else if (arg == "--lora-base")
@ -2267,6 +2311,21 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
log_set_target(stdout); log_set_target(stdout);
LOG_INFO("logging to file is disabled.", {}); LOG_INFO("logging to file is disabled.", {});
} }
else if (arg == "--chat-template")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
std::string value(argv[i]);
if (value != "chatml" && value != "llama2") {
fprintf(stderr, "error: chat template can be \"llama2\" or \"chatml\", but got: %s\n", value.c_str());
invalid_param = true;
break;
}
sparams.chat_template = value;
}
else if (arg == "--override-kv") else if (arg == "--override-kv")
{ {
if (++i >= argc) { if (++i >= argc) {
@ -2318,7 +2377,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
} }
if (!params.kv_overrides.empty()) { if (!params.kv_overrides.empty()) {
params.kv_overrides.emplace_back(llama_model_kv_override()); params.kv_overrides.emplace_back();
params.kv_overrides.back().key[0] = 0; params.kv_overrides.back().key[0] = 0;
} }
@ -2614,7 +2673,9 @@ int main(int argc, char **argv)
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = { json data = {
{ "user_name", llama.name_user.c_str() }, { "user_name", llama.name_user.c_str() },
{ "assistant_name", llama.name_assistant.c_str() } { "assistant_name", llama.name_assistant.c_str() },
{ "default_generation_settings", llama.default_generation_settings_for_props },
{ "total_slots", llama.params.n_parallel }
}; };
res.set_content(data.dump(), "application/json; charset=utf-8"); res.set_content(data.dump(), "application/json; charset=utf-8");
}); });
@ -2718,13 +2779,13 @@ int main(int argc, char **argv)
// TODO: add mount point without "/v1" prefix -- how? // TODO: add mount point without "/v1" prefix -- how?
svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) svr.Post("/v1/chat/completions", [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
{ {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
if (!validate_api_key(req, res)) { if (!validate_api_key(req, res)) {
return; return;
} }
json data = oaicompat_completion_params_parse(json::parse(req.body)); json data = oaicompat_completion_params_parse(json::parse(req.body), sparams.chat_template);
const int task_id = llama.queue_tasks.get_new_id(); const int task_id = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(task_id); llama.queue_results.add_waiting_task_id(task_id);
@ -2865,12 +2926,6 @@ int main(int argc, char **argv)
} }
}); });
svr.Get("/model.json", [&llama](const httplib::Request &, httplib::Response &res)
{
const json data = llama.get_model_props();
return res.set_content(data.dump(), "application/json; charset=utf-8");
});
svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res) svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res)
{ return res.set_content("", "application/json; charset=utf-8"); }); { return res.set_content("", "application/json; charset=utf-8"); });

View File

@ -167,6 +167,34 @@ static T json_value(const json &body, const std::string &key, const T &default_v
: default_value; : default_value;
} }
inline std::string format_llama2(std::vector<json> messages)
{
std::ostringstream output;
bool is_inside_turn = false;
for (auto it = messages.begin(); it != messages.end(); ++it) {
if (!is_inside_turn) {
output << "[INST] ";
}
std::string role = json_value(*it, "role", std::string("user"));
std::string content = json_value(*it, "content", std::string(""));
if (role == "system") {
output << "<<SYS>>\n" << content << "\n<<SYS>>\n\n";
is_inside_turn = true;
} else if (role == "user") {
output << content << " [/INST]";
is_inside_turn = true;
} else {
output << " " << content << " </s>";
is_inside_turn = false;
}
}
LOG_VERBOSE("format_llama2", {{"text", output.str()}});
return output.str();
}
inline std::string format_chatml(std::vector<json> messages) inline std::string format_chatml(std::vector<json> messages)
{ {
std::ostringstream chatml_msgs; std::ostringstream chatml_msgs;
@ -180,6 +208,8 @@ inline std::string format_chatml(std::vector<json> messages)
chatml_msgs << "<|im_start|>assistant" << '\n'; chatml_msgs << "<|im_start|>assistant" << '\n';
LOG_VERBOSE("format_chatml", {{"text", chatml_msgs.str()}});
return chatml_msgs.str(); return chatml_msgs.str();
} }

View File

@ -2,7 +2,7 @@
:: Copyright (C) 2024 Intel Corporation :: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT :: SPDX-License-Identifier: MIT
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:" set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

View File

@ -1,5 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h"
#include "common.h" #include "common.h"
#include "train.h" #include "train.h"
#include "llama.h" #include "llama.h"
@ -19,8 +20,6 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
static const size_t tensor_alignment = 32;
struct my_llama_hparams { struct my_llama_hparams {
uint32_t n_vocab = 32000; uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; uint32_t n_ctx = 512;
@ -58,7 +57,7 @@ struct my_llama_layer {
struct my_llama_model { struct my_llama_model {
struct ggml_context * ctx = NULL; struct ggml_context * ctx = NULL;
std::vector<uint8_t> data; ggml_backend_buffer_t data = NULL;
my_llama_hparams hparams; my_llama_hparams hparams;
@ -147,39 +146,6 @@ static void set_param_model(struct my_llama_model * model) {
} }
} }
static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * model) {
ggml_allocr_alloc(alloc, model->tok_embeddings);
ggml_allocr_alloc(alloc, model->norm);
ggml_allocr_alloc(alloc, model->output);
for (uint32_t i = 0; i < model->layers.size(); ++i) {
auto & layer = model->layers[i];
ggml_allocr_alloc(alloc, layer.attention_norm);
ggml_allocr_alloc(alloc, layer.wq);
ggml_allocr_alloc(alloc, layer.wk);
ggml_allocr_alloc(alloc, layer.wv);
ggml_allocr_alloc(alloc, layer.wo);
ggml_allocr_alloc(alloc, layer.ffn_norm);
ggml_allocr_alloc(alloc, layer.w1);
ggml_allocr_alloc(alloc, layer.w2);
ggml_allocr_alloc(alloc, layer.w3);
}
ggml_allocr_alloc(alloc, model->tok_embeddings->grad);
ggml_allocr_alloc(alloc, model->norm->grad);
ggml_allocr_alloc(alloc, model->output->grad);
for (uint32_t i = 0; i < model->layers.size(); ++i) {
auto & layer = model->layers[i];
ggml_allocr_alloc(alloc, layer.attention_norm->grad);
ggml_allocr_alloc(alloc, layer.wq->grad);
ggml_allocr_alloc(alloc, layer.wk->grad);
ggml_allocr_alloc(alloc, layer.wv->grad);
ggml_allocr_alloc(alloc, layer.wo->grad);
ggml_allocr_alloc(alloc, layer.ffn_norm->grad);
ggml_allocr_alloc(alloc, layer.w1->grad);
ggml_allocr_alloc(alloc, layer.w2->grad);
ggml_allocr_alloc(alloc, layer.w3->grad);
}
}
static void init_model(struct my_llama_model * model) { static void init_model(struct my_llama_model * model) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
@ -252,17 +218,8 @@ static void init_model(struct my_llama_model * model) {
set_param_model(model); set_param_model(model);
// measure data size
size_t size = 0;
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
}
// allocate data // allocate data
struct ggml_allocr * alloc = NULL; model->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
model->data.resize(size + tensor_alignment);
alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
alloc_model(alloc, model);
} }
static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
@ -297,7 +254,7 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean,
static struct ggml_tensor * llama_build_train_graphs( static struct ggml_tensor * llama_build_train_graphs(
struct my_llama_model * model, struct my_llama_model * model,
struct ggml_allocr * alloc, ggml_gallocr_t alloc,
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_cgraph * gf, struct ggml_cgraph * gf,
struct ggml_cgraph * gb, struct ggml_cgraph * gb,
@ -308,7 +265,8 @@ static struct ggml_tensor * llama_build_train_graphs(
const int n_tokens, const int n_tokens,
const int n_batch, const int n_batch,
const bool enable_flash_attn, const bool enable_flash_attn,
const bool enable_checkpointing) { const bool enable_checkpointing,
const bool measure_only) {
ggml_set_scratch(ctx, { 0, 0, nullptr, }); ggml_set_scratch(ctx, { 0, 0, nullptr, });
const int n_past = 0; const int n_past = 0;
@ -334,13 +292,7 @@ static struct ggml_tensor * llama_build_train_graphs(
// KQ_pos - contains the positions // KQ_pos - contains the positions
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
ggml_allocr_alloc(alloc, KQ_pos); ggml_set_input(KQ_pos);
if (!ggml_allocr_is_measure(alloc)) {
int * data = (int *) KQ_pos->data;
for (int i = 0; i < N; ++i) {
data[i] = n_past + i;
}
}
// rope has so much parameters that we make a custom function for it // rope has so much parameters that we make a custom function for it
auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
@ -448,21 +400,31 @@ static struct ggml_tensor * llama_build_train_graphs(
// KQ_pos // KQ_pos
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
ggml_set_input(t36->grad);
ggml_allocr_alloc(alloc, t36->grad);
// allocating checkpoints in one block to reduce memory fragmentation // allocating checkpoints in one block to reduce memory fragmentation
// note: they will be freed in reverse order // note: they will be freed in reverse order
for (int i = 0; i < (int) checkpoints.size(); ++i) { for (int i = 0; i < (int) checkpoints.size(); ++i) {
if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
ggml_allocr_alloc(alloc, checkpoints[i]); ggml_set_input(checkpoints[i]);
} }
} }
//int n_leafs_after = gb->n_leafs; //int n_leafs_after = gb->n_leafs;
//int n_nodes_after = gb->n_nodes; //int n_nodes_after = gb->n_nodes;
if (measure_only) {
// FIXME: will still allocate
ggml_gallocr_reserve(alloc, gb);
} else {
ggml_gallocr_alloc_graph(alloc, gb);
ggml_allocr_alloc_graph(alloc, gb); if (!measure_only) {
int * data = (int *) KQ_pos->data;
for (int i = 0; i < N; ++i) {
data[i] = n_past + i;
}
}
}
// remove the additional nodes and leafs // remove the additional nodes and leafs
for (int i = n_leafs_before; i < gb->n_leafs; ++i) { for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
@ -1046,7 +1008,7 @@ int main(int argc, char ** argv) {
printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + model.data.size()), (float) (ggml_used_mem(model.ctx) + model.data.size()) / (1024.0f*1024.0f)); printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)), (float) (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)) / (1024.0f*1024.0f));
if (params.only_write_model) { if (params.only_write_model) {
save_train_files_data save_data; save_train_files_data save_data;
@ -1073,11 +1035,6 @@ int main(int argc, char ** argv) {
int n_vocab = model.hparams.n_vocab; int n_vocab = model.hparams.n_vocab;
int n_batch = params.common.n_batch; int n_batch = params.common.n_batch;
std::vector<uint8_t> mem_input_data;
std::vector<uint8_t> mem_compute_data;
ggml_allocr * alloc = NULL;
// context for input tensors without their data // context for input tensors without their data
struct ggml_init_params ctx_input_params = { struct ggml_init_params ctx_input_params = {
ggml_tensor_overhead() * 2, // mem_size ggml_tensor_overhead() * 2, // mem_size
@ -1091,16 +1048,10 @@ int main(int argc, char ** argv) {
struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
// measure required memory for input tensors // measure required memory for input tensors
size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
tensor_alignment;
printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
// allocate input tensors // allocate input tensors
mem_input_data.resize(max_input_size); ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); size_t max_input_size = ggml_backend_buffer_get_size(input_data);
ggml_allocr_alloc(alloc, tokens_input); printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
ggml_allocr_alloc(alloc, target_probs);
// context for compute tensors without their data // context for compute tensors without their data
const size_t estimated_compute_size_wo_data = ( const size_t estimated_compute_size_wo_data = (
@ -1127,7 +1078,7 @@ int main(int argc, char ** argv) {
// find best evaluation order // find best evaluation order
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
ctx_compute = ggml_init(ctx_compute_params); ctx_compute = ggml_init(ctx_compute_params);
alloc = ggml_allocr_new_measure(tensor_alignment); ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf->order = (enum ggml_cgraph_eval_order) order; gf->order = (enum ggml_cgraph_eval_order) order;
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1140,9 +1091,10 @@ int main(int argc, char ** argv) {
&logits, tokens_input, target_probs, &logits, tokens_input, target_probs,
n_tokens, n_batch, n_tokens, n_batch,
params.common.use_flash, params.common.use_flash,
params.common.use_checkpointing params.common.use_checkpointing,
true
); );
size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
if (max_compute_size < best_compute_size) { if (max_compute_size < best_compute_size) {
best_compute_size = max_compute_size; best_compute_size = max_compute_size;
best_order = gf->order; best_order = gf->order;
@ -1157,9 +1109,8 @@ int main(int argc, char ** argv) {
"invalid"); "invalid");
// allocate compute tensors // allocate compute tensors
mem_compute_data.resize(max_compute_size);
ctx_compute = ggml_init(ctx_compute_params); ctx_compute = ggml_init(ctx_compute_params);
alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf->order = best_order; gf->order = best_order;
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1172,7 +1123,8 @@ int main(int argc, char ** argv) {
&logits, tokens_input, target_probs, &logits, tokens_input, target_probs,
n_tokens, n_batch, n_tokens, n_batch,
params.common.use_flash, params.common.use_flash,
params.common.use_checkpointing params.common.use_checkpointing,
false
); );
std::vector<llama_token> train_tokens; std::vector<llama_token> train_tokens;

18
flake.lock generated
View File

@ -5,11 +5,11 @@
"nixpkgs-lib": "nixpkgs-lib" "nixpkgs-lib": "nixpkgs-lib"
}, },
"locked": { "locked": {
"lastModified": 1704982712, "lastModified": 1706830856,
"narHash": "sha256-2Ptt+9h8dczgle2Oo6z5ni5rt/uLMG47UFTR1ry/wgg=", "narHash": "sha256-a0NYyp+h9hlb7ddVz4LUn1vT/PLwqfrWYcHMvFB1xYg=",
"owner": "hercules-ci", "owner": "hercules-ci",
"repo": "flake-parts", "repo": "flake-parts",
"rev": "07f6395285469419cf9d078f59b5b49993198c00", "rev": "b253292d9c0a5ead9bc98c4e9a26c6312e27d69f",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1706191920, "lastModified": 1707268954,
"narHash": "sha256-eLihrZAPZX0R6RyM5fYAWeKVNuQPYjAkCUBr+JNvtdE=", "narHash": "sha256-2en1kvde3cJVc3ZnTy8QeD2oKcseLFjYPLKhIGDanQ0=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "ae5c332cbb5827f6b1f02572496b141021de335f", "rev": "f8e2ebd66d097614d51a56a755450d4ae1632df1",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -37,11 +37,11 @@
"nixpkgs-lib": { "nixpkgs-lib": {
"locked": { "locked": {
"dir": "lib", "dir": "lib",
"lastModified": 1703961334, "lastModified": 1706550542,
"narHash": "sha256-M1mV/Cq+pgjk0rt6VxoyyD+O8cOUiai8t9Q6Yyq4noY=", "narHash": "sha256-UcsnCG6wx++23yeER4Hg18CXWbgNpqNXcHIo5/1Y+hc=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "b0d36bd0a420ecee3bc916c91886caca87c894e9", "rev": "97b17f32362e475016f942bbdfda4a4a72a8a652",
"type": "github" "type": "github"
}, },
"original": { "original": {

View File

@ -157,6 +157,7 @@
mpi-cpu = config.packages.default.override { useMpi = true; }; mpi-cpu = config.packages.default.override { useMpi = true; };
mpi-cuda = config.packages.default.override { useMpi = true; }; mpi-cuda = config.packages.default.override { useMpi = true; };
vulkan = config.packages.default.override { useVulkan = true; };
} }
// lib.optionalAttrs (system == "x86_64-linux") { // lib.optionalAttrs (system == "x86_64-linux") {
rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp; rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;

File diff suppressed because it is too large Load Diff

View File

@ -6,88 +6,62 @@
extern "C" { extern "C" {
#endif #endif
struct ggml_backend; typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
struct ggml_backend_buffer; typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
struct ggml_backend_buffer_type; typedef struct ggml_backend * ggml_backend_t;
//
// Legacy API
//
typedef struct ggml_allocr * ggml_allocr_t;
// initialize allocator for use with CPU backend only
GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
// initialize allocator for use with ggml-backend
GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
// tell the allocator to parse nodes following the order described in the list
// you should call this if your graph are optimized to execute out-of-order
GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
GGML_API void ggml_allocr_free (ggml_allocr_t alloc);
GGML_API bool ggml_allocr_is_measure (ggml_allocr_t alloc);
GGML_API void ggml_allocr_reset (ggml_allocr_t alloc);
GGML_API void ggml_allocr_alloc (ggml_allocr_t alloc, struct ggml_tensor * tensor);
GGML_API size_t ggml_allocr_max_size (ggml_allocr_t alloc);
GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
//
// ggml-backend v2 API
//
// Separate tensor and graph allocator objects
// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
// The original API is kept as a wrapper around the new API
// Tensor allocator // Tensor allocator
typedef struct ggml_tallocr * ggml_tallocr_t; typedef struct ggml_tallocr * ggml_tallocr_t;
GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment); GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment); GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc);
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size); GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
GGML_API void ggml_tallocr_free (ggml_tallocr_t talloc);
GGML_API bool ggml_tallocr_is_measure (ggml_tallocr_t talloc);
GGML_API void ggml_tallocr_reset (ggml_tallocr_t talloc);
GGML_API void ggml_tallocr_alloc (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
GGML_API size_t ggml_tallocr_max_size (ggml_tallocr_t talloc);
// Graph allocator // Graph allocator
/*
Example usage:
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
// optional: create a worst-case graph and reserve the buffers to avoid reallocations
ggml_gallocr_reserve(galloc, build_graph(max_batch));
// allocate the graph
struct ggml_cgraph * graph = build_graph(batch);
ggml_gallocr_alloc_graph(galloc, graph);
printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
// evaluate the graph
ggml_backend_graph_compute(backend, graph);
*/
// special tensor flags for use with the graph allocator:
// ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
// ggml_set_output(): output tensors are never freed and never overwritten
typedef struct ggml_gallocr * ggml_gallocr_t; typedef struct ggml_gallocr * ggml_gallocr_t;
GGML_API ggml_gallocr_t ggml_gallocr_new(void); GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
GGML_API void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n); // pre-allocate buffers from a measure graph - does not allocate or modify the graph
GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph); // call with a worst-case graph to avoid buffer reallocations
// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
// returns false if the buffer allocation failed
GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
// Allocate tensors from the allocators given by the hash table // automatic reallocation if the topology changes when using a single buffer
GGML_API void ggml_gallocr_alloc_graph_n( // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
ggml_gallocr_t galloc, GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
struct ggml_cgraph * graph,
struct ggml_hash_set hash_set,
ggml_tallocr_t * hash_node_talloc);
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
// Utils // Utils
// Create a buffer and allocate all the tensors in a ggml_context // Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
#ifdef __cplusplus #ifdef __cplusplus
} }

File diff suppressed because it is too large Load Diff

View File

@ -84,7 +84,8 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_cpu_init(void); GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer // Create a backend buffer from an existing pointer
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
@ -129,11 +130,7 @@ extern "C" {
// in build_graph: // in build_graph:
build_graph(...) { build_graph(...) {
// allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer) // manually assign nodes to a backend (optional, should not be needed in most cases)
alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
ggml_allocr_alloc(alloc_cpu, tensor);
// manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
struct ggml_tensor * node = ggml_mul_mat(ctx, ...); struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
ggml_backend_sched_set_node_backend(sched, node, backend_gpu); ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
} }
@ -163,20 +160,19 @@ extern "C" {
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph // Initialize backend buffers from a measure graph
GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
// Get the number of splits of the last graph // Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate and compute graph on the backend scheduler // Allocate and compute graph on the backend scheduler
GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
// Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs // Reset all assignments and allocators - must be called before changing the node backends
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
// Set a callback to be called for each resulting node during graph compute // Set a callback to be called for each resulting node during graph compute

View File

@ -151,8 +151,8 @@
#define CUDA_USE_TENSOR_CORES #define CUDA_USE_TENSOR_CORES
#endif #endif
// max batch size to use MMQ kernels when tensor cores are available #define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
#define MMQ_MAX_BATCH_SIZE 32 #define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available
#if defined(GGML_USE_HIPBLAS) #if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300 #define __CUDA_ARCH__ 1300
@ -5337,41 +5337,81 @@ template <bool need_check> static __global__ void
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda> template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) { #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
const int row = blockIdx.x*blockDim.y + threadIdx.y; // tell the compiler to use as many registers as it wants, see nwarps definition below
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void mul_mat_vec_q(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
if (row >= nrows) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
return; constexpr int nwarps = 1;
} constexpr int rows_per_cuda_block = 1;
#else
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
const int blocks_per_row = ncols / qk; const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
const int blocks_per_warp = vdr * WARP_SIZE / qi; const int row0 = rows_per_cuda_block*blockIdx.x;
const int blocks_per_row_x = ncols_x / qk;
const int blocks_per_col_y = nrows_y / QK8_1;
constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
// partial sum for each thread // partial sum for each thread
float tmp = 0.0f; float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
const block_q_t * x = (const block_q_t *) vx; const block_q_t * x = (const block_q_t *) vx;
const block_q8_1 * y = (const block_q8_1 *) vy; const block_q8_1 * y = (const block_q8_1 *) vy;
for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) { for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
const int ibx = row*blocks_per_row + i; // x block index const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx // x block quant index when casting the quants to int
const int kqs = vdr * (tid % (qi/vdr));
const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int #pragma unroll
for (int j = 0; j < ncols_y; ++j) {
#pragma unroll
for (int i = 0; i < rows_per_cuda_block; ++i) {
tmp[j][i] += vec_dot_q_cuda(
&x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
}
}
}
tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs); __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
if (threadIdx.y > 0) {
#pragma unroll
for (int j = 0; j < ncols_y; ++j) {
#pragma unroll
for (int i = 0; i < rows_per_cuda_block; ++i) {
tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
}
}
}
__syncthreads();
if (threadIdx.y > 0) {
return;
} }
// sum up partial sums and write back result // sum up partial sums and write back result
#pragma unroll #pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) { for (int j = 0; j < ncols_y; ++j) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); #pragma unroll
for (int i = 0; i < rows_per_cuda_block; ++i) {
#pragma unroll
for (int l = 0; l < nwarps-1; ++l) {
tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
}
tmp[j][i] = warp_reduce_sum(tmp[j][i]);
} }
if (threadIdx.x == 0) { if (threadIdx.x < rows_per_cuda_block) {
dst[row] = tmp; dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
}
} }
} }
@ -7373,121 +7413,85 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
} }
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot>
GGML_ASSERT(ncols % QK4_0 == 0); static void mul_mat_vec_q_cuda(
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const void * vx, const void * vy, float * dst,
const dim3 block_nums(block_num_y, 1, 1); const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols_x % qk == 0);
GGML_ASSERT(ncols % QK4_1 == 0); GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { int id;
GGML_ASSERT(ncols % QK5_0 == 0); CUDA_CHECK(cudaGetDevice(&id));
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { int64_t nwarps = 1;
GGML_ASSERT(ncols % QK5_1 == 0); int64_t rows_per_cuda_block = 1;
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { if (g_device_caps[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
GGML_ASSERT(ncols % QK8_0 == 0); switch(ncols_y) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; case 1:
const dim3 block_nums(block_num_y, 1, 1); nwarps = 4;
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); rows_per_cuda_block = 1;
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1> break;
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); case 2:
} case 3:
case 4:
nwarps = 4;
rows_per_cuda_block = 2;
break;
case 5:
case 6:
case 7:
case 8:
nwarps = 2;
rows_per_cuda_block = 2;
break;
default:
GGML_ASSERT(false);
break;
}
}
const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
const dim3 block_nums(nblocks, 1, 1);
const dim3 block_dims(WARP_SIZE, nwarps, 1);
static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { switch (ncols_y) {
GGML_ASSERT(ncols % QK_K == 0); case 1:
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot>
const dim3 block_nums(block_num_y, 1, 1); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); break;
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1> case 2:
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot>
} <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { case 3:
GGML_ASSERT(ncols % QK_K == 0); mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot>
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
const dim3 block_nums(block_num_y, 1, 1); break;
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); case 4:
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1> mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
} break;
case 5:
static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot>
GGML_ASSERT(ncols % QK_K == 0); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; break;
const dim3 block_nums(block_num_y, 1, 1); case 6:
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot>
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1> <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); break;
} case 7:
mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot>
static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
GGML_ASSERT(ncols % QK_K == 0); break;
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; case 8:
const dim3 block_nums(block_num_y, 1, 1); mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot>
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1> break;
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); default:
} GGML_ASSERT(false);
break;
static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { }
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
} }
static void ggml_mul_mat_q4_0_q8_1_cuda( static void ggml_mul_mat_q4_0_q8_1_cuda(
@ -9011,7 +9015,7 @@ static void ggml_cuda_op_mul_mat_q(
CUDA_CHECK(cudaGetDevice(&id)); CUDA_CHECK(cudaGetDevice(&id));
// the main device has a larger memory buffer to hold the results from all GPUs // the main device has a larger memory buffer to hold the results from all GPUs
// nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into // nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
switch (src0->type) { switch (src0->type) {
@ -9142,50 +9146,73 @@ static void ggml_cuda_op_mul_mat_vec_q(
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream) { const int64_t src1_padded_row_size, cudaStream_t stream) {
GGML_ASSERT(ggml_nrows(src1) == 1);
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t row_diff = row_high - row_low; const int64_t row_diff = row_high - row_low;
const int64_t ne10 = src1->ne[0];
GGML_ASSERT(ne10 % QK8_1 == 0);
const int64_t ne0 = dst->ne[0];
int id;
CUDA_CHECK(cudaGetDevice(&id));
// the main device has a larger memory buffer to hold the results from all GPUs
// nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_cuda<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break; break;
case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_1:
mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_cuda<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break; break;
case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_0:
mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_cuda<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break; break;
case GGML_TYPE_Q5_1: case GGML_TYPE_Q5_1:
mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_cuda<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break; break;
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_cuda<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break; break;
case GGML_TYPE_Q2_K: case GGML_TYPE_Q2_K:
mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_cuda<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break; break;
case GGML_TYPE_Q3_K: case GGML_TYPE_Q3_K:
mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_cuda<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break; break;
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_cuda<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break; break;
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_cuda<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break; break;
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_cuda<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break; break;
case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XXS:
mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_cuda<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break; break;
case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ2_XS:
mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break; break;
case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ3_XXS:
mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break; break;
default: default:
GGML_ASSERT(false); GGML_ASSERT(false);
@ -10293,7 +10320,7 @@ static __global__ void k_compute_batched_ptrs(
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
} }
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_transposed(src1));
@ -10451,39 +10478,69 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
int64_t min_compute_capability = INT_MAX; int64_t min_compute_capability = INT_MAX;
bool any_pascal_with_slow_fp16 = false;
if (split) { if (split) {
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
auto & tensor_split = buft_ctx->tensor_split; auto & tensor_split = buft_ctx->tensor_split;
for (int id = 0; id < g_device_count; ++id) { for (int id = 0; id < g_device_count; ++id) {
if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) { // skip devices that are not going to do any work:
if (tensor_split[id] >= (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
continue;
}
if (min_compute_capability > g_device_caps[id].cc) {
min_compute_capability = g_device_caps[id].cc; min_compute_capability = g_device_caps[id].cc;
} }
if (g_device_caps[id].cc == 610) {
any_pascal_with_slow_fp16 = true;
}
} }
} else { } else {
min_compute_capability = g_device_caps[g_main_device].cc; min_compute_capability = g_device_caps[g_main_device].cc;
any_pascal_with_slow_fp16 = g_device_caps[g_main_device].cc == 610;
} }
// check data types and tensor shapes for custom matrix multiplication kernels:
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
const bool fp16_performance_good = min_compute_capability >= CC_RDNA1; const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
bool use_mul_mat_q = ggml_is_quantized(src0->type);
#ifdef CUDA_USE_TENSOR_CORES #ifdef CUDA_USE_TENSOR_CORES
use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3; use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
#endif // CUDA_USE_TENSOR_CORES #endif // CUDA_USE_TENSOR_CORES
#else #else
const bool fp16_performance_good = min_compute_capability >= CC_VOLTA; // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
// mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;
#ifdef CUDA_USE_TENSOR_CORES #ifdef CUDA_USE_TENSOR_CORES
// when tensor cores are available, use them for large batch size // when tensor cores are available, use them for large batch size
// ref: https://github.com/ggerganov/llama.cpp/pull/3776 // ref: https://github.com/ggerganov/llama.cpp/pull/3776
use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE); use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
#endif // CUDA_USE_TENSOR_CORES #endif // CUDA_USE_TENSOR_CORES
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
use_mul_mat_q = use_mul_mat_q && ggml_cuda_supports_mmq(src0->type); // if mmvq is available it's a better choice than dmmv:
#ifndef GGML_CUDA_FORCE_DMMV
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
#endif // GGML_CUDA_FORCE_DMMV
// debug helpers // debug helpers
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
@ -10501,33 +10558,16 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
ggml_cuda_mul_mat_vec_nc(src0, src1, dst); ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// KQ + KQV multi-batch // KQ + KQV multi-batch
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); ggml_cuda_mul_mat_batched_cublas(src0, src1, dst);
} else if (src0->type == GGML_TYPE_F32) { } else if (use_dequantize_mul_mat_vec) {
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) {
#ifdef GGML_CUDA_FORCE_DMMV
const bool use_mul_mat_vec_q = false;
#else
const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
#endif // GGML_CUDA_FORCE_DMMV
if (use_mul_mat_vec_q) {
// NOTE: this kernel does not support ggml_nrows(src1) > 1
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
} else {
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
} } else if (use_mul_mat_vec_q) {
} else { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
if (use_mul_mat_q) { } else if (use_mul_mat_q) {
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
} else { } else {
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
} }
}
} else {
GGML_ASSERT(false);
}
} }
#if 0 #if 0

View File

@ -19,6 +19,7 @@ extern "C" {
// fall back to the _Static_assert C11 keyword. // fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop // if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976 // ref: https://stackoverflow.com/a/53923785/4039976
#ifndef __cplusplus
#ifndef static_assert #ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg) #define static_assert(cond, msg) _Static_assert(cond, msg)
@ -26,6 +27,7 @@ extern "C" {
#define static_assert(cond, msg) struct global_scope_noop_trick #define static_assert(cond, msg) struct global_scope_noop_trick
#endif #endif
#endif #endif
#endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))

View File

@ -703,6 +703,7 @@ static bool ggml_metal_graph_compute(
struct ggml_metal_context * ctx, struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) { struct ggml_cgraph * gf) {
@autoreleasepool {
MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor; MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
edesc.dispatchType = MTLDispatchTypeSerial; edesc.dispatchType = MTLDispatchTypeSerial;
@ -2395,6 +2396,7 @@ static bool ggml_metal_graph_compute(
[[MTLCaptureManager sharedCaptureManager] stopCapture]; [[MTLCaptureManager sharedCaptureManager] stopCapture];
} }
}
return true; return true;
} }

View File

@ -49,6 +49,8 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#define UNUSED GGML_UNUSED
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
@ -268,6 +270,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
#if defined(__ARM_NEON) #if defined(__ARM_NEON)
#ifdef _MSC_VER
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
#else
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
#endif
#if !defined(__aarch64__) #if !defined(__aarch64__)
// 64-bit compatibility // 64-bit compatibility
@ -2381,7 +2394,10 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
uint8_t L[QK_K]; uint8_t L[QK_K];
uint8_t Laux[32]; uint8_t Laux[32];
uint8_t Ls[QK_K/32];
uint8_t Lm[QK_K/32];
float weights[32]; float weights[32];
float sw[QK_K/32];
float mins[QK_K/32]; float mins[QK_K/32];
float scales[QK_K/32]; float scales[QK_K/32];
@ -2389,11 +2405,9 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
float sum_x2 = 0; float sum_x2 = 0;
for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l]; for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
float sigma2 = sum_x2/QK_K; float sigma2 = 2*sum_x2/QK_K;
float av_x = sqrtf(sigma2); float av_x = sqrtf(sigma2);
float max_scale = 0; // as we are deducting the min, scales are always positive
float max_min = 0;
for (int j = 0; j < QK_K/32; ++j) { for (int j = 0; j < QK_K/32; ++j) {
if (quant_weights) { if (quant_weights) {
const float * qw = quant_weights + QK_K*i + 32*j; const float * qw = quant_weights + QK_K*i + 32*j;
@ -2401,25 +2415,17 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
} else { } else {
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
} }
float sumw = 0;
for (int l = 0; l < 32; ++l) sumw += weights[l];
sw[j] = sumw;
scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
//scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
float scale = scales[j];
if (scale > max_scale) {
max_scale = scale;
}
float min = mins[j];
if (min > max_min) {
max_min = min;
}
} }
float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
float inv_min = max_min > 0 ? 63.f/max_min : 0.f; float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw);
for (int j = 0; j < QK_K/32; ++j) { for (int j = 0; j < QK_K/32; ++j) {
uint8_t ls = nearest_int(inv_scale*scales[j]); uint8_t ls = Ls[j];
uint8_t lm = nearest_int(inv_min*mins[j]); uint8_t lm = Lm[j];
ls = MIN(63, ls);
lm = MIN(63, lm);
if (j < 4) { if (j < 4) {
y[i].scales[j] = ls; y[i].scales[j] = ls;
y[i].scales[j+4] = lm; y[i].scales[j+4] = lm;
@ -2429,8 +2435,8 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
y[i].scales[j-0] |= ((lm >> 4) << 6); y[i].scales[j-0] |= ((lm >> 4) << 6);
} }
} }
y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); y[i].d = GGML_FP32_TO_FP16(d_block);
y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); y[i].dmin = GGML_FP32_TO_FP16(m_block);
uint8_t sc, m; uint8_t sc, m;
for (int j = 0; j < QK_K/32; ++j) { for (int j = 0; j < QK_K/32; ++j) {
@ -2688,20 +2694,21 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
const int nb = n_per_row / QK_K; const int nb = n_per_row / QK_K;
uint8_t L[QK_K]; uint8_t L[QK_K];
uint8_t Laux[32];
uint8_t Ls[QK_K/32];
uint8_t Lm[QK_K/32];
float mins[QK_K/32]; float mins[QK_K/32];
float scales[QK_K/32]; float scales[QK_K/32];
float sw[QK_K/32];
float weights[32]; float weights[32];
uint8_t Laux[32];
for (int i = 0; i < nb; i++) { for (int i = 0; i < nb; i++) {
float sum_x2 = 0; float sum_x2 = 0;
for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l]; for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
float sigma2 = sum_x2/QK_K; float sigma2 = 2*sum_x2/QK_K;
float av_x = sqrtf(sigma2); float av_x = sqrtf(sigma2);
float max_scale = 0; // as we are deducting the min, scales are always positive
float max_min = 0;
for (int j = 0; j < QK_K/32; ++j) { for (int j = 0; j < QK_K/32; ++j) {
if (quant_weights) { if (quant_weights) {
const float * qw = quant_weights + QK_K*i + 32*j; const float * qw = quant_weights + QK_K*i + 32*j;
@ -2709,22 +2716,19 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
} else { } else {
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
} }
float sumw = 0;
for (int l = 0; l < 32; ++l) sumw += weights[l];
sw[j] = sumw;
scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
float scale = scales[j];
if (scale > max_scale) {
max_scale = scale;
}
float min = mins[j];
if (min > max_min) {
max_min = min;
}
} }
float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
float inv_min = max_min > 0 ? 63.f/max_min : 0.f; float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw);
for (int j = 0; j < QK_K/32; ++j) { for (int j = 0; j < QK_K/32; ++j) {
uint8_t ls = nearest_int(inv_scale*scales[j]); uint8_t ls = Ls[j];
uint8_t lm = nearest_int(inv_min*mins[j]); uint8_t lm = Lm[j];
ls = MIN(63, ls); ls = MIN(63, ls);
lm = MIN(63, lm); lm = MIN(63, lm);
if (j < 4) { if (j < 4) {
@ -2736,8 +2740,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
y[i].scales[j-0] |= ((lm >> 4) << 6); y[i].scales[j-0] |= ((lm >> 4) << 6);
} }
} }
y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); y[i].d = GGML_FP32_TO_FP16(d_block);
y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); y[i].dmin = GGML_FP32_TO_FP16(m_block);
uint8_t sc, m; uint8_t sc, m;
for (int j = 0; j < QK_K/32; ++j) { for (int j = 0; j < QK_K/32; ++j) {
@ -3675,15 +3679,92 @@ static inline __m128i get_scale_shuffle(int i) {
} }
#endif #endif
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_0; const int qk = QK8_0;
const int nb = n / qk; const int nb = n / qk;
assert(n % qk == 0); assert(n % qk == 0);
#if defined(__ARM_FEATURE_MATMUL_INT8)
assert((nrc == 2) || (nrc == 1));
#else
assert(nrc == 1);
#endif
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q4_0 * restrict x = vx; const block_q4_0 * restrict x = vx;
const block_q8_0 * restrict y = vy; const block_q8_0 * restrict y = vy;
#if defined(__ARM_FEATURE_MATMUL_INT8)
if (nrc == 2) {
const block_q4_0 * restrict vx0 = vx;
const block_q4_0 * restrict vx1 = vx + bx;
const block_q8_0 * restrict vy0 = vy;
const block_q8_0 * restrict vy1 = vy + by;
float32x4_t sumv0 = vdupq_n_f32(0.0f);
for (int i = 0; i < nb; i++) {
const block_q4_0 * restrict b_x0 = &vx0[i];
const block_q4_0 * restrict b_x1 = &vx1[i];
const block_q8_0 * restrict b_y0 = &vy0[i];
const block_q8_0 * restrict b_y1 = &vy1[i];
const uint8x16_t m4b = vdupq_n_u8(0x0F);
const int8x16_t s8b = vdupq_n_s8(0x8);
const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
// 4-bit -> 8-bit
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
// sub 8
const int8x16_t x0_l = vsubq_s8(v0_0l, s8b);
const int8x16_t x0_h = vsubq_s8(v0_0h, s8b);
const int8x16_t x1_l = vsubq_s8(v0_1l, s8b);
const int8x16_t x1_h = vsubq_s8(v0_1h, s8b);
// load y
const int8x16_t y0_l = vld1q_s8(b_y0->qs);
const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
l1, r1)), l2, r2)), l3, r3))), scale);
}
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
vst1_f32(s, vget_low_f32(sumv2));
vst1_f32(s + bs, vget_high_f32(sumv2));
return;
}
#endif
#if defined(__ARM_NEON) #if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t sumv1 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f);
@ -3965,15 +4046,93 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
#endif #endif
} }
void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_1; const int qk = QK8_1;
const int nb = n / qk; const int nb = n / qk;
assert(n % qk == 0); assert(n % qk == 0);
#if defined(__ARM_FEATURE_MATMUL_INT8)
assert((nrc == 2) || (nrc == 1));
#else
assert(nrc == 1);
#endif
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q4_1 * restrict x = vx; const block_q4_1 * restrict x = vx;
const block_q8_1 * restrict y = vy; const block_q8_1 * restrict y = vy;
#if defined(__ARM_FEATURE_MATMUL_INT8)
if (nrc == 2) {
const block_q4_1 * restrict vx0 = vx;
const block_q4_1 * restrict vx1 = vx + bx;
const block_q8_1 * restrict vy0 = vy;
const block_q8_1 * restrict vy1 = vy + by;
float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t summs0 = vdupq_n_f32(0.0f);
for (int i = 0; i < nb; i++) {
const block_q4_1 * restrict b_x0 = &vx0[i];
const block_q4_1 * restrict b_x1 = &vx1[i];
const block_q8_1 * restrict b_y0 = &vy0[i];
const block_q8_1 * restrict b_y1 = &vy1[i];
float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s,
GGML_FP16_TO_FP32(b_x1->m) * b_y0->s,
GGML_FP16_TO_FP32(b_x0->m) * b_y1->s,
GGML_FP16_TO_FP32(b_x1->m) * b_y1->s};
summs0 += summs_t;
const uint8x16_t m4b = vdupq_n_u8(0x0F);
const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
// 4-bit -> 8-bit
const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
// load y
const int8x16_t y0_l = vld1q_s8(b_y0->qs);
const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
// mmla into int32x4_t
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
l1, r1)), l2, r2)), l3, r3))), scale);
}
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
sumv2 = sumv2 + summs0;
vst1_f32(s, vget_low_f32(sumv2));
vst1_f32(s + bs, vget_high_f32(sumv2));
return;
}
#endif
// TODO: add WASM SIMD // TODO: add WASM SIMD
#if defined(__ARM_NEON) #if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv0 = vdupq_n_f32(0.0f);
@ -4105,12 +4264,17 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
#endif #endif
} }
void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_0; const int qk = QK8_0;
const int nb = n / qk; const int nb = n / qk;
assert(n % qk == 0); assert(n % qk == 0);
assert(qk == QK5_0); assert(qk == QK5_0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q5_0 * restrict x = vx; const block_q5_0 * restrict x = vx;
const block_q8_0 * restrict y = vy; const block_q8_0 * restrict y = vy;
@ -4391,12 +4555,17 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
#endif #endif
} }
void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_1; const int qk = QK8_1;
const int nb = n / qk; const int nb = n / qk;
assert(n % qk == 0); assert(n % qk == 0);
assert(qk == QK5_1); assert(qk == QK5_1);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q5_1 * restrict x = vx; const block_q5_1 * restrict x = vx;
const block_q8_1 * restrict y = vy; const block_q8_1 * restrict y = vy;
@ -4690,15 +4859,79 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
#endif #endif
} }
void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_0; const int qk = QK8_0;
const int nb = n / qk; const int nb = n / qk;
assert(n % qk == 0); assert(n % qk == 0);
#if defined(__ARM_FEATURE_MATMUL_INT8)
assert((nrc == 2) || (nrc == 1));
#else
assert(nrc == 1);
#endif
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q8_0 * restrict x = vx; const block_q8_0 * restrict x = vx;
const block_q8_0 * restrict y = vy; const block_q8_0 * restrict y = vy;
#if defined(__ARM_FEATURE_MATMUL_INT8)
if (nrc == 2) {
const block_q8_0 * restrict vx0 = vx;
const block_q8_0 * restrict vx1 = vx + bx;
const block_q8_0 * restrict vy0 = vy;
const block_q8_0 * restrict vy1 = vy + by;
float32x4_t sumv0 = vdupq_n_f32(0.0f);
for (int i = 0; i < nb; i++) {
const block_q8_0 * restrict b_x0 = &vx0[i];
const block_q8_0 * restrict b_y0 = &vy0[i];
const block_q8_0 * restrict b_x1 = &vx1[i];
const block_q8_0 * restrict b_y1 = &vy1[i];
const int8x16_t x0_l = vld1q_s8(b_x0->qs);
const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
const int8x16_t x1_l = vld1q_s8(b_x1->qs);
const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16);
// load y
const int8x16_t y0_l = vld1q_s8(b_y0->qs);
const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
l1, r1)), l2, r2)), l3, r3))), scale);
}
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
vst1_f32(s, vget_low_f32(sumv2));
vst1_f32(s + bs, vget_high_f32(sumv2));
return;
}
#endif
#if defined(__ARM_NEON) #if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t sumv1 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f);
@ -4793,7 +5026,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
} }
#if QK_K == 256 #if QK_K == 256
void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q2_K * restrict x = vx; const block_q2_K * restrict x = vx;
const block_q8_K * restrict y = vy; const block_q8_K * restrict y = vy;
@ -5169,7 +5407,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
#else #else
void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q2_K * restrict x = vx; const block_q2_K * restrict x = vx;
const block_q8_K * restrict y = vy; const block_q8_K * restrict y = vy;
@ -5427,8 +5670,13 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
#endif #endif
#if QK_K == 256 #if QK_K == 256
void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0); assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const uint32_t kmask1 = 0x03030303; const uint32_t kmask1 = 0x03030303;
const uint32_t kmask2 = 0x0f0f0f0f; const uint32_t kmask2 = 0x0f0f0f0f;
@ -5947,8 +6195,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
#else #else
void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0); assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q3_K * restrict x = vx; const block_q3_K * restrict x = vx;
const block_q8_K * restrict y = vy; const block_q8_K * restrict y = vy;
@ -6290,8 +6543,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
#endif #endif
#if QK_K == 256 #if QK_K == 256
void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0); assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q4_K * restrict x = vx; const block_q4_K * restrict x = vx;
const block_q8_K * restrict y = vy; const block_q8_K * restrict y = vy;
@ -6646,8 +6904,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
#endif #endif
} }
#else #else
void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0); assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q4_K * restrict x = vx; const block_q4_K * restrict x = vx;
const block_q8_K * restrict y = vy; const block_q8_K * restrict y = vy;
@ -6889,8 +7152,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
#endif #endif
#if QK_K == 256 #if QK_K == 256
void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0); assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q5_K * restrict x = vx; const block_q5_K * restrict x = vx;
const block_q8_K * restrict y = vy; const block_q8_K * restrict y = vy;
@ -7309,8 +7577,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
#else #else
void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0); assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q5_K * restrict x = vx; const block_q5_K * restrict x = vx;
const block_q8_K * restrict y = vy; const block_q8_K * restrict y = vy;
@ -7575,8 +7848,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
#if QK_K == 256 #if QK_K == 256
void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0); assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q6_K * restrict x = vx; const block_q6_K * restrict x = vx;
const block_q8_K * restrict y = vy; const block_q8_K * restrict y = vy;
@ -8007,8 +8285,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
#else #else
void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0); assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q6_K * restrict x = vx; const block_q6_K * restrict x = vx;
const block_q8_K * restrict y = vy; const block_q8_K * restrict y = vy;
@ -8337,8 +8620,13 @@ static const int8_t keven_signs_q2xs[1024] = {
1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
}; };
void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0); assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_iq2_xxs * restrict x = vx; const block_iq2_xxs * restrict x = vx;
const block_q8_K * restrict y = vy; const block_q8_K * restrict y = vy;
@ -8460,8 +8748,13 @@ void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * res
#endif #endif
} }
void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0); assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_iq2_xs * restrict x = vx; const block_iq2_xs * restrict x = vx;
const block_q8_K * restrict y = vy; const block_q8_K * restrict y = vy;
@ -8680,8 +8973,13 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
} }
// TODO // TODO
void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0); assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_iq3_xxs * restrict x = vx; const block_iq3_xxs * restrict x = vx;
const block_q8_K * restrict y = vy; const block_q8_K * restrict y = vy;
@ -8707,10 +9005,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * res
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
q8b = ggml_vld1q_s8_x4(q8); q8 += 64; q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t); memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]}; const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]}; const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]}; const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]}; const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
q3 += 16; q3 += 16;
q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127)))); q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127)))); q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
@ -9048,8 +9346,6 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
int8_t L[32]; int8_t L[32];
int8_t Laux[32]; int8_t Laux[32];
float waux[32]; float waux[32];
bool is_on_grid[4];
bool is_on_grid_aux[4];
uint8_t block_signs[4]; uint8_t block_signs[4];
uint32_t q2[2*(QK_K/32)]; uint32_t q2[2*(QK_K/32)];
@ -9099,10 +9395,11 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
memset(L, 0, 32); memset(L, 0, 32);
continue; continue;
} }
float scale = make_qp_quants(32, kMaxQ+1, xval, (uint8_t*)L, weight);
float eff_max = scale*kMaxQ;
float best = 0; float best = 0;
float scale = max/(2*kMaxQ-1); for (int is = -6; is <= 6; ++is) {
for (int is = -9; is <= 9; ++is) { float id = (2*kMaxQ-1+is*0.1f)/eff_max;
float id = (2*kMaxQ-1+is*0.1f)/max;
float this_scale = 1/id; float this_scale = 1/id;
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
for (int i = 0; i < 8; ++i) { for (int i = 0; i < 8; ++i) {
@ -9112,9 +9409,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
uint16_t u = 0; uint16_t u = 0;
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i); for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
int grid_index = kmap_q2xs[u]; int grid_index = kmap_q2xs[u];
is_on_grid_aux[k] = true;
if (grid_index < 0) { if (grid_index < 0) {
is_on_grid_aux[k] = false;
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k); grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
} }
@ -9128,16 +9423,12 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
} }
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
scale = sumqx/sumq2; best = scale*sumqx; scale = sumqx/sumq2; best = scale*sumqx;
for (int i = 0; i < 32; ++i) L[i] = Laux[i]; memcpy(L, Laux, 32);
for (int k = 0; k < 4; ++k) is_on_grid[k] = is_on_grid_aux[k];
} }
} }
int n_not_ongrid = 0; if (scale > 0) {
for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
if (n_not_ongrid > 0 && scale > 0) {
float id = 1/scale; float id = 1/scale;
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
if (is_on_grid[k]) continue;
uint16_t u = 0; uint16_t u = 0;
for (int i = 0; i < 8; ++i) { for (int i = 0; i < 8; ++i) {
int l = nearest_int(0.5f*(id*xval[8*k+i]-1)); int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
@ -9193,49 +9484,10 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
float d = max_scale/31; float d = max_scale/31;
y[ibl].d = GGML_FP32_TO_FP16(d); y[ibl].d = GGML_FP32_TO_FP16(d);
float id = 1/d; float id = 1/d;
float sumqx = 0, sumq2 = 0;
for (int ib = 0; ib < QK_K/32; ++ib) { for (int ib = 0; ib < QK_K/32; ++ib) {
int l = nearest_int(0.5f*(id*scales[ib]-1)); int l = nearest_int(0.5f*(id*scales[ib]-1));
l = MAX(0, MIN(15, l)); l = MAX(0, MIN(15, l));
q2[2*ib+1] |= ((uint32_t)l << 28); q2[2*ib+1] |= ((uint32_t)l << 28);
const float * xb = xbl + 32*ib;
const float * qw = quant_weights + QK_K*ibl + 32*ib;
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
const uint8_t * aux8 = (const uint8_t *)(q2 + 2*ib);
const float db = d * (1 + 2*l);
uint32_t u = 0;
for (int k = 0; k < 4; ++k) {
const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127);
const float * xk = xb + 8*k;
const float * wk = weight + 8*k;
const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
float best_mse = 0; int best_index = aux8[k];
for (int j = 0; j < 8; ++j) {
float diff = db * grid[j] * signs[j] - xk[j];
best_mse += wk[j] * diff * diff;
}
for (int idx = 0; idx < 256; ++idx) {
grid = (const uint8_t *)(kgrid_q2xs + idx);
float mse = 0;
for (int j = 0; j < 8; ++j) {
float diff = db * grid[j] * signs[j] - xk[j];
mse += wk[j] * diff * diff;
}
if (mse < best_mse) {
best_mse = mse; best_index = idx;
}
}
u |= (best_index << 8*k);
grid = (const uint8_t *)(kgrid_q2xs + best_index);
//grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
for (int j = 0; j < 8; ++j) {
float q = db * grid[j] * signs[j];
sumqx += wk[j] * q * xk[j];
sumq2 += wk[j] * q * q;
}
}
q2[2*ib] = u;
if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
} }
memcpy(y[ibl].qs, q2, QK_K/4); memcpy(y[ibl].qs, q2, QK_K/4);
} }

View File

@ -191,70 +191,74 @@ typedef struct {
} block_iq3_xxs; } block_iq3_xxs;
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
#ifdef __cplusplus
extern "C" {
#endif
// Quantization // Quantization
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k); void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k); void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k); void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k); void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k); void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q5_0(const float * restrict x, void * restrict y, int k); void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q5_1(const float * restrict x, void * restrict y, int k); void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q8_0(const float * restrict x, void * restrict y, int k); void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q8_1(const float * restrict x, void * restrict y, int k); void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_iq3_xxs(const float * restrict x, void * restrict y, int k); void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
// Dequantization // Dequantization
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k); void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k); void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k); void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k); void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k); //void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k); void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k); void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k); void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k); void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
// Dot product // Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
// //
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
@ -276,3 +280,8 @@ void iq2xs_init_impl(int grid_size);
void iq2xs_free_impl(int grid_size); void iq2xs_free_impl(int grid_size);
void iq3xs_init_impl(int grid_size); void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size); void iq3xs_free_impl(int grid_size);
#ifdef __cplusplus
}
#endif

View File

@ -337,6 +337,7 @@ namespace dpct
} }
size_t get_global_mem_size() const { return _global_mem_size; } size_t get_global_mem_size() const { return _global_mem_size; }
size_t get_local_mem_size() const { return _local_mem_size; } size_t get_local_mem_size() const { return _local_mem_size; }
size_t get_max_mem_alloc_size() const { return _max_mem_alloc_size; }
/// Returns the maximum clock rate of device's global memory in kHz. If /// Returns the maximum clock rate of device's global memory in kHz. If
/// compiler does not support this API then returns default value 3200000 kHz. /// compiler does not support this API then returns default value 3200000 kHz.
unsigned int get_memory_clock_rate() const { return _memory_clock_rate; } unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
@ -398,6 +399,10 @@ namespace dpct
{ {
_local_mem_size = local_mem_size; _local_mem_size = local_mem_size;
} }
void set_max_mem_alloc_size(size_t max_mem_alloc_size)
{
_max_mem_alloc_size = max_mem_alloc_size;
}
void set_max_work_group_size(int max_work_group_size) void set_max_work_group_size(int max_work_group_size)
{ {
_max_work_group_size = max_work_group_size; _max_work_group_size = max_work_group_size;
@ -465,6 +470,7 @@ namespace dpct
int _max_register_size_per_work_group; int _max_register_size_per_work_group;
size_t _global_mem_size; size_t _global_mem_size;
size_t _local_mem_size; size_t _local_mem_size;
size_t _max_mem_alloc_size;
size_t _max_nd_range_size[3]; size_t _max_nd_range_size[3];
int _max_nd_range_size_i[3]; int _max_nd_range_size_i[3];
uint32_t _device_id; uint32_t _device_id;
@ -516,6 +522,7 @@ namespace dpct
dev.get_info<sycl::info::device::max_work_group_size>()); dev.get_info<sycl::info::device::max_work_group_size>());
prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>()); prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>()); prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());
prop.set_max_mem_alloc_size(dev.get_info<sycl::info::device::max_mem_alloc_size>());
#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6) #if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
if (dev.has(sycl::aspect::ext_intel_memory_clock_rate)) if (dev.has(sycl::aspect::ext_intel_memory_clock_rate))
@ -644,6 +651,11 @@ namespace dpct
return get_device_info().get_global_mem_size(); return get_device_info().get_global_mem_size();
} }
size_t get_max_mem_alloc_size() const
{
return get_device_info().get_max_mem_alloc_size();
}
/// Get the number of bytes of free and total memory on the SYCL device. /// Get the number of bytes of free and total memory on the SYCL device.
/// \param [out] free_memory The number of bytes of free memory on the SYCL device. /// \param [out] free_memory The number of bytes of free memory on the SYCL device.
/// \param [out] total_memory The number of bytes of total memory on the SYCL device. /// \param [out] total_memory The number of bytes of total memory on the SYCL device.
@ -1354,6 +1366,7 @@ namespace dpct
} }
#else #else
return q.memcpy(to_ptr, from_ptr, size, dep_events); return q.memcpy(to_ptr, from_ptr, size, dep_events);
GGML_UNUSED(direction);
#endif // DPCT_USM_LEVEL_NONE #endif // DPCT_USM_LEVEL_NONE
} }
@ -1655,7 +1668,7 @@ namespace dpct
using Ty = typename DataType<T>::T2; using Ty = typename DataType<T>::T2;
Ty s_h; Ty s_h;
if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only) if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host) detail::dpct_memcpy(q, (void *)&s_h, (const void *)s, sizeof(T), device_to_host)
.wait(); .wait();
else else
s_h = *reinterpret_cast<const Ty *>(s); s_h = *reinterpret_cast<const Ty *>(s);
@ -1679,6 +1692,20 @@ namespace dpct
int ldb, const void *beta, void *c, int ldc) int ldb, const void *beta, void *c, int ldc)
{ {
#ifndef __INTEL_MKL__ #ifndef __INTEL_MKL__
GGML_UNUSED(q);
GGML_UNUSED(a_trans);
GGML_UNUSED(b_trans);
GGML_UNUSED(m);
GGML_UNUSED(n);
GGML_UNUSED(k);
GGML_UNUSED(alpha);
GGML_UNUSED(a);
GGML_UNUSED(lda);
GGML_UNUSED(b);
GGML_UNUSED(ldb);
GGML_UNUSED(beta);
GGML_UNUSED(c);
GGML_UNUSED(ldc);
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces " throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
"Project does not support this API."); "Project does not support this API.");
#else #else
@ -1818,7 +1845,7 @@ namespace dpct
template <typename T> template <typename T>
T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
int logical_sub_group_size = 32) unsigned int logical_sub_group_size = 32)
{ {
unsigned int id = g.get_local_linear_id(); unsigned int id = g.get_local_linear_id();
unsigned int start_index = unsigned int start_index =
@ -2148,6 +2175,7 @@ namespace dpct
} }
#else #else
return q.memcpy(to_ptr, from_ptr, size, dep_events); return q.memcpy(to_ptr, from_ptr, size, dep_events);
GGML_UNUSED(direction);
#endif // DPCT_USM_LEVEL_NONE #endif // DPCT_USM_LEVEL_NONE
} }
@ -2928,7 +2956,6 @@ void ggml_sycl_set_main_device(int main_device);
void ggml_sycl_set_mul_mat_q(bool mul_mat_q); void ggml_sycl_set_mul_mat_q(bool mul_mat_q);
void ggml_sycl_set_scratch_size(size_t scratch_size); void ggml_sycl_set_scratch_size(size_t scratch_size);
void ggml_sycl_free_scratch(void); void ggml_sycl_free_scratch(void);
int ggml_sycl_get_device_count(void);
void ggml_sycl_get_device_description(int device, char * description, size_t description_size); void ggml_sycl_get_device_description(int device, char * description, size_t description_size);
bool ggml_backend_is_sycl(ggml_backend_t backend); bool ggml_backend_is_sycl(ggml_backend_t backend);
int ggml_backend_sycl_get_device(ggml_backend_t backend); int ggml_backend_sycl_get_device(ggml_backend_t backend);
@ -3291,7 +3318,7 @@ void log_ggml_var_device(const char*name, float *src, size_t total_elements, boo
std::ofstream logfile; std::ofstream logfile;
logfile.open(filename); logfile.open(filename);
// printf("local buf element %d\n", total_elements); // printf("local buf element %d\n", total_elements);
for(int i=0; i<total_elements; i++){ for(size_t i=0; i<total_elements; i++){
if((i+1)%20 ==0) logfile <<std::endl; if((i+1)%20 ==0) logfile <<std::endl;
else logfile << local_buf[i] <<" "; else logfile << local_buf[i] <<" ";
} }
@ -3385,6 +3412,7 @@ static __dpct_inline__ float warp_reduce_max(float x,
static __dpct_inline__ float op_repeat(const float a, const float b) { static __dpct_inline__ float op_repeat(const float a, const float b) {
return b; return b;
GGML_UNUSED(a);
} }
static __dpct_inline__ float op_add(const float a, const float b) { static __dpct_inline__ float op_add(const float a, const float b) {
@ -7665,6 +7693,13 @@ static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
*dsti = *xi; *dsti = *xi;
} }
static void cpy_1_f16_f32(const char * cxi, char * cdsti) {
const sycl::half *xi = (const sycl::half *)cxi;
float *dsti = (float *)cdsti;
*dsti = *xi;
}
static void cpy_1_i16_i16(const char * cxi, char * cdsti) { static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
const int16_t *xi = (const int16_t *)cxi; const int16_t *xi = (const int16_t *)cxi;
int16_t *dsti = (int16_t *)cdsti; int16_t *dsti = (int16_t *)cdsti;
@ -7681,9 +7716,9 @@ static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
template <cpy_kernel_t cpy_1> template <cpy_kernel_t cpy_1>
static void cpy_f32_f16(const char * cx, char * cdst, const int ne, static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
const sycl::nd_item<3> &item_ct1) { const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2); item_ct1.get_local_id(2);
@ -7693,15 +7728,17 @@ static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
// determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
// then combine those indices with the corresponding byte offsets to get the total offsets // then combine those indices with the corresponding byte offsets to get the total offsets
const int i02 = i / (ne00*ne01); const int i03 = i/(ne00 * ne01 * ne02);
const int i01 = (i - i02*ne01*ne00) / ne00; const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int i00 = i - i02*ne01*ne00 - i01*ne00; const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
const int i12 = i / (ne10*ne11); const int i13 = i/(ne10 * ne11 * ne12);
const int i11 = (i - i12*ne10*ne11) / ne10; const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
const int i10 = i - i12*ne10*ne11 - i11*ne10; const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
cpy_1(cx + x_offset, cdst + dst_offset); cpy_1(cx + x_offset, cdst + dst_offset);
} }
@ -7795,9 +7832,9 @@ static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
template <cpy_kernel_t cpy_blck, int qk> template <cpy_kernel_t cpy_blck, int qk>
static void cpy_f32_q(const char * cx, char * cdst, const int ne, static void cpy_f32_q(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
const sycl::nd_item<3> &item_ct1) { const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2)) * item_ct1.get_local_id(2)) *
qk; qk;
@ -7806,15 +7843,17 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne,
return; return;
} }
const int i02 = i / (ne00*ne01); const int i03 = i/(ne00 * ne01 * ne02);
const int i01 = (i - i02*ne01*ne00) / ne00; const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int i00 = (i - i02*ne01*ne00 - i01*ne00); const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
const int i12 = i / (ne10*ne11); const int i13 = i/(ne10 * ne11 * ne12);
const int i11 = (i - i12*ne10*ne11) / ne10; const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk; const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
cpy_blck(cx + x_offset, cdst + dst_offset); cpy_blck(cx + x_offset, cdst + dst_offset);
} }
@ -8219,7 +8258,8 @@ static void clamp_f32(const float * x, float * dst, const float min, const float
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]); dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
} }
static void im2col_f32_f16(const float *x, sycl::half *dst, int offset_delta, template <typename T>
static void im2col_kernel(const float *x, T *dst, int offset_delta,
int IW, int IH, int OW, int KW, int KH, int IW, int IH, int OW, int KW, int KH,
int pelements, int CHW, int s0, int s1, int p0, int pelements, int CHW, int s0, int s1, int p0,
int p1, int d0, int d1, int p1, int d0, int d1,
@ -10570,10 +10610,12 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
const int ne00, const int ne01, const int ne00, const int ne01,
const int nb00, const int nb01, const int ne02, const int nb00,
const int nb02, const int ne10, const int nb01, const int nb02,
const int ne11, const int nb10, const int nb03, const int ne10,
const int nb11, const int nb12, const int ne11, const int ne12,
const int nb10, const int nb11,
const int nb12, const int nb13,
dpct::queue_ptr stream) { dpct::queue_ptr stream) {
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
@ -10586,8 +10628,8 @@ static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) { [=](sycl::nd_item<3> item_ct1) {
cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, nb00, nb01, cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
nb02, ne10, ne11, nb10, nb11, nb12, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
item_ct1); item_ct1);
}); });
} }
@ -10595,10 +10637,12 @@ static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
const int ne00, const int ne01, const int ne00, const int ne01,
const int nb00, const int nb01, const int ne02, const int nb00,
const int nb02, const int ne10, const int nb01, const int nb02,
const int ne11, const int nb10, const int nb03, const int ne10,
const int nb11, const int nb12, const int ne11, const int ne12,
const int nb10, const int nb11,
const int nb12, const int nb13,
dpct::queue_ptr stream) { dpct::queue_ptr stream) {
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
@ -10611,8 +10655,8 @@ static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) { [=](sycl::nd_item<3> item_ct1) {
cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01, cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
nb02, ne10, ne11, nb10, nb11, nb12, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
item_ct1); item_ct1);
}); });
} }
@ -10620,10 +10664,12 @@ static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne,
const int ne00, const int ne01, const int ne00, const int ne01,
const int nb00, const int nb01, const int ne02, const int nb00,
const int nb02, const int ne10, const int nb01, const int nb02,
const int ne11, const int nb10, const int nb03, const int ne10,
const int nb11, const int nb12, const int ne11, const int ne12,
const int nb10, const int nb11,
const int nb12, const int nb13,
dpct::queue_ptr stream) { dpct::queue_ptr stream) {
GGML_ASSERT(ne % QK8_0 == 0); GGML_ASSERT(ne % QK8_0 == 0);
@ -10632,17 +10678,20 @@ static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne,
sycl::range<3>(1, 1, 1)), sycl::range<3>(1, 1, 1)),
[=](sycl::nd_item<3> item_ct1) { [=](sycl::nd_item<3> item_ct1) {
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>( cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(
cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
ne10, ne11, nb10, nb11, nb12, item_ct1); nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
item_ct1);
}); });
} }
static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne,
const int ne00, const int ne01, const int ne00, const int ne01,
const int nb00, const int nb01, const int ne02, const int nb00,
const int nb02, const int ne10, const int nb01, const int nb02,
const int ne11, const int nb10, const int nb03, const int ne10,
const int nb11, const int nb12, const int ne11, const int ne12,
const int nb10, const int nb11,
const int nb12, const int nb13,
dpct::queue_ptr stream) { dpct::queue_ptr stream) {
GGML_ASSERT(ne % QK4_0 == 0); GGML_ASSERT(ne % QK4_0 == 0);
@ -10651,17 +10700,20 @@ static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne,
sycl::range<3>(1, 1, 1)), sycl::range<3>(1, 1, 1)),
[=](sycl::nd_item<3> item_ct1) { [=](sycl::nd_item<3> item_ct1) {
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>( cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(
cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
ne10, ne11, nb10, nb11, nb12, item_ct1); nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
item_ct1);
}); });
} }
static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne,
const int ne00, const int ne01, const int ne00, const int ne01,
const int nb00, const int nb01, const int ne02, const int nb00,
const int nb02, const int ne10, const int nb01, const int nb02,
const int ne11, const int nb10, const int nb03, const int ne10,
const int nb11, const int nb12, const int ne11, const int ne12,
const int nb10, const int nb11,
const int nb12, const int nb13,
dpct::queue_ptr stream) { dpct::queue_ptr stream) {
GGML_ASSERT(ne % QK4_1 == 0); GGML_ASSERT(ne % QK4_1 == 0);
@ -10670,17 +10722,20 @@ static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne,
sycl::range<3>(1, 1, 1)), sycl::range<3>(1, 1, 1)),
[=](sycl::nd_item<3> item_ct1) { [=](sycl::nd_item<3> item_ct1) {
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>( cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(
cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
ne10, ne11, nb10, nb11, nb12, item_ct1); nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
item_ct1);
}); });
} }
static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
const int ne00, const int ne01, const int ne00, const int ne01,
const int nb00, const int nb01, const int ne02, const int nb00,
const int nb02, const int ne10, const int nb01, const int nb02,
const int ne11, const int nb10, const int nb03, const int ne10,
const int nb11, const int nb12, const int ne11, const int ne12,
const int nb10, const int nb11,
const int nb12, const int nb13,
dpct::queue_ptr stream) { dpct::queue_ptr stream) {
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
@ -10693,8 +10748,8 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) { [=](sycl::nd_item<3> item_ct1) {
cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01, cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
nb02, ne10, ne11, nb10, nb11, nb12, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
item_ct1); item_ct1);
}); });
} }
@ -10702,10 +10757,12 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
const int ne00, const int ne01, const int ne00, const int ne01,
const int nb00, const int nb01, const int ne02, const int nb00,
const int nb02, const int ne10, const int nb01, const int nb02,
const int ne11, const int nb10, const int nb03, const int ne10,
const int nb11, const int nb12, const int ne11, const int ne12,
const int nb10, const int nb11,
const int nb12, const int nb13,
dpct::queue_ptr stream) { dpct::queue_ptr stream) {
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
@ -10718,8 +10775,8 @@ static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) { [=](sycl::nd_item<3> item_ct1) {
cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, nb00, nb01, cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
nb02, ne10, ne11, nb10, nb11, nb12, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
item_ct1); item_ct1);
}); });
} }
@ -10727,10 +10784,12 @@ static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
const int ne00, const int ne01, const int ne00, const int ne01,
const int nb00, const int nb01, const int ne02, const int nb00,
const int nb02, const int ne10, const int nb01, const int nb02,
const int ne11, const int nb10, const int nb03, const int ne10,
const int nb11, const int nb12, const int ne11, const int ne12,
const int nb10, const int nb11,
const int nb12, const int nb13,
dpct::queue_ptr stream) { dpct::queue_ptr stream) {
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
@ -10743,8 +10802,8 @@ static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) { [=](sycl::nd_item<3> item_ct1) {
cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, nb00, nb01, cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
nb02, ne10, ne11, nb10, nb11, nb12, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
item_ct1); item_ct1);
}); });
} }
@ -10991,7 +11050,8 @@ static void soft_max_f32_sycl(const float *x, const float *y, float *dst,
}); });
} }
static void im2col_f32_f16_sycl(const float *x, sycl::half *dst, int IW, int IH, template <typename T>
static void im2col_sycl(const float *x, T *dst, int IW, int IH,
int OW, int OH, int KW, int KH, int IC, int OW, int OH, int KW, int KH, int IC,
int offset_delta, int s0, int s1, int p0, int offset_delta, int s0, int s1, int p0,
int p1, int d0, int d1, int p1, int d0, int d1,
@ -11008,7 +11068,7 @@ static void im2col_f32_f16_sycl(const float *x, sycl::half *dst, int IW, int IH,
sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)), sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) { [=](sycl::nd_item<3> item_ct1) {
im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH, im2col_kernel(x, dst, offset_delta, IW, IH, OW, KW, KH,
parallel_elements, (IC * KH * KW), s0, s1, p0, parallel_elements, (IC * KH * KW), s0, s1, p0,
p1, d0, d1, item_ct1); p1, d0, d1, item_ct1);
}); });
@ -11145,10 +11205,10 @@ DPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported.
// g_sycl_pool_handles[GGML_SYCL_MAX_DEVICES]; // g_sycl_pool_handles[GGML_SYCL_MAX_DEVICES];
static dpct::device_ptr g_sycl_pool_addr[GGML_SYCL_MAX_DEVICES] = {0}; static dpct::device_ptr g_sycl_pool_addr[GGML_SYCL_MAX_DEVICES] = {0};
static size_t g_sycl_pool_used[GGML_SYCL_MAX_DEVICES] = {0}; static size_t g_sycl_pool_used[GGML_SYCL_MAX_DEVICES] = {0};
static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
static void *ggml_sycl_pool_malloc_vmm(size_t size, size_t *actual_size) try { static void *ggml_sycl_pool_malloc_vmm(size_t size, size_t *actual_size) try {
GGML_UNUSED(size);
GGML_UNUSED(actual_size);
return NULL; return NULL;
} }
catch (sycl::exception const &exc) { catch (sycl::exception const &exc) {
@ -11312,10 +11372,10 @@ void ggml_init_sycl() try {
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
int64_t total_vram = 0; int64_t total_vram = 0;
#if defined(GGML_SYCL_FP16) #if defined(GGML_SYCL_F16)
fprintf(stderr, "%s: GGML_SYCL_FP16: yes\n", __func__); fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__);
#else #else
fprintf(stderr, "%s: GGML_SYCL_FP16: no\n", __func__); fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
#endif #endif
@ -11338,9 +11398,8 @@ void ggml_init_sycl() try {
if(id!=user_device_id) continue; if(id!=user_device_id) continue;
device_inx++; device_inx++;
int device_vmm = 0;
g_device_caps[device_inx].vmm = !!device_vmm; g_device_caps[device_inx].vmm = 0;
g_device_caps[device_inx].device_id = id; g_device_caps[device_inx].device_id = id;
g_sycl_device_id2index[id].index = device_inx; g_sycl_device_id2index[id].index = device_inx;
@ -11348,18 +11407,12 @@ void ggml_init_sycl() try {
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
prop, dpct::dev_mgr::instance().get_device(id)))); prop, dpct::dev_mgr::instance().get_device(id))));
// fprintf(stderr,
// " Device %d: %s, compute capability %d.%d, VMM: %s\n", id,
// prop.get_name(), prop.get_major_version(),
// prop.get_minor_version(), device_vmm ? "yes" : "no");
g_tensor_split[device_inx] = total_vram; g_tensor_split[device_inx] = total_vram;
total_vram += prop.get_global_mem_size(); total_vram += prop.get_global_mem_size();
g_device_caps[device_inx].cc = g_device_caps[device_inx].cc =
100 * prop.get_major_version() + 10 * prop.get_minor_version(); 100 * prop.get_major_version() + 10 * prop.get_minor_version();
// printf("g_device_caps[%d].cc=%d\n", device_inx, g_device_caps[device_inx].cc);
} }
device_inx = -1; device_inx = -1;
for (int id = 0; id < g_all_sycl_device_count; ++id) { for (int id = 0; id < g_all_sycl_device_count; ++id) {
@ -11525,11 +11578,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
} }
char * dst_ptr = (char *) dst; char * dst_ptr = (char *) dst;
const int64_t ne0 = src->ne[0]; GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne);
const int64_t nb0 = src->nb[0]; GGML_TENSOR_LOCALS(int64_t, nb, src, nb);
const int64_t nb1 = src->nb[1];
const int64_t nb2 = src->nb[2];
const int64_t nb3 = src->nb[3];
const enum ggml_type type = src->type; const enum ggml_type type = src->type;
const int64_t ts = ggml_type_size(type); const int64_t ts = ggml_type_size(type);
const int64_t bs = ggml_blck_size(type); const int64_t bs = ggml_blck_size(type);
@ -12095,7 +12145,8 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
const int64_t src1_ncols, const int64_t src1_padded_row_size, const int64_t src1_ncols, const int64_t src1_padded_row_size,
const dpct::queue_ptr &stream) { const dpct::queue_ptr &stream) {
const int64_t ne00 = src0->ne[0]; GGML_TENSOR_BINARY_OP_LOCALS
const int64_t row_diff = row_high - row_low; const int64_t row_diff = row_high - row_low;
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
@ -12114,8 +12165,9 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
} else { } else {
src1_dfloat = src1_dfloat_a.alloc(ne00); src1_dfloat = src1_dfloat_a.alloc(ne00);
ggml_cpy_f32_f16_sycl((const char *)src1_ddf_i, (char *)src1_dfloat, ggml_cpy_f32_f16_sycl((const char *)src1_ddf_i, (char *)src1_dfloat,
ne00, ne00, 1, sizeof(float), 0, 0, ne00, 1, ne00, ne00, ne01, ne02, nb00, nb01, nb02,
sizeof(sycl::half), 0, 0, stream); nb03, ne10, ne11, ne12, nb10, nb11, nb12,
nb13, stream);
} }
} }
#else #else
@ -12195,7 +12247,6 @@ inline void ggml_sycl_op_mul_mat_sycl(
// ldc == nrows of the matrix that cuBLAS writes into // ldc == nrows of the matrix that cuBLAS writes into
int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff; int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
const int compute_capability = g_device_caps[id].cc;
#ifdef GGML_SYCL_F16 #ifdef GGML_SYCL_F16
bool use_fp16 = true; // TODO(Yu) SYCL capability check bool use_fp16 = true; // TODO(Yu) SYCL capability check
#else #else
@ -12372,9 +12423,7 @@ inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0]; GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t nrows = ggml_nrows(src0); const int64_t nrows = ggml_nrows(src0);
//const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
@ -12404,7 +12453,7 @@ inline void ggml_sycl_op_im2col(const ggml_tensor *src0,
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F16); GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
@ -12427,8 +12476,11 @@ inline void ggml_sycl_op_im2col(const ggml_tensor *src0,
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
im2col_f32_f16_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, if (dst->type == GGML_TYPE_F16) {
IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
} else {
im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
}
(void) src0; (void) src0;
(void) src0_dd; (void) src0_dd;
@ -12680,7 +12732,7 @@ static void ggml_sycl_set_peer_access(const int n_tokens) {
continue; continue;
} }
int can_access_peer; // int can_access_peer;
// SYCL_CHECK(syclDeviceCanAccessPeer(&can_access_peer, id, id_other)); // SYCL_CHECK(syclDeviceCanAccessPeer(&can_access_peer, id, id_other));
// if (can_access_peer) { // if (can_access_peer) {
// if (enable_peer_access) { // if (enable_peer_access) {
@ -12701,16 +12753,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
ggml_sycl_op_mul_mat_t op, ggml_sycl_op_mul_mat_t op,
const bool convert_src1_to_q8_1) try { const bool convert_src1_to_q8_1) try {
const int64_t ne00 = src0->ne[0]; GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t nrows0 = ggml_nrows(src0);
const int64_t ne10 = src1->ne[0]; GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
const int64_t nrows1 = ggml_nrows(src1); const int64_t nrows1 = ggml_nrows(src1);
GGML_ASSERT(ne03 == ne13); GGML_ASSERT(ne03 == ne13);
@ -13281,23 +13326,13 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00); GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t nb01 = src0->nb[1]; GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);
const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
const int64_t ne10 = src1->ne[0]; GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
const int64_t nb11 = src1->nb[1]; GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
const int64_t ne1 = ggml_nelements(src1); const int64_t ne1 = ggml_nelements(src1);
const int64_t ne = ggml_nelements(dst); const int64_t ne = ggml_nelements(dst);
@ -13599,23 +13634,15 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00); GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
const int64_t ne01 = src00->ne[1];
const int64_t ne02 = src00->ne[2];
const int64_t ne03 = src00->ne[3];
//const int64_t nb01 = src00->nb[1]; //const int64_t nb01 = src00->nb[1];
const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02); GGML_TENSOR_LOCALS(int64_t, nb0, src00, nb);
const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
const int64_t ne10 = src1->ne[0]; GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
//const int64_t nb11 = src1->nb[1]; //const int64_t nb11 = src1->nb[1];
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
const int64_t ne1 = ggml_nelements(src1); const int64_t ne1 = ggml_nelements(src1);
const int64_t ne = ggml_nelements(dst); const int64_t ne = ggml_nelements(dst);
@ -13801,13 +13828,6 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
src1_row_extra.data_device[g_main_device_index] = src1_contiguous.get(); src1_row_extra.data_device[g_main_device_index] = src1_contiguous.get();
dst_row_extra.data_device[g_main_device_index] = dst_contiguous.get(); dst_row_extra.data_device[g_main_device_index] = dst_contiguous.get();
const dpct::memcpy_direction src1_kind =
src1->backend == GGML_BACKEND_CPU ? dpct::host_to_device
: dpct::device_to_device;
const dpct::memcpy_direction dst_kind = dst->backend == GGML_BACKEND_CPU
? dpct::device_to_host
: dpct::device_to_device;
for (int32_t row_id = 0; row_id < n_as; ++row_id) { for (int32_t row_id = 0; row_id < n_as; ++row_id) {
const struct ggml_tensor * src0_row = dst->src[row_id + 2]; const struct ggml_tensor * src0_row = dst->src[row_id + 2];
@ -13891,21 +13911,7 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
const int64_t ne00 = src0->ne[0]; GGML_TENSOR_BINARY_OP_LOCALS;
const int64_t ne01 = src0->ne[1];
GGML_ASSERT(src0->ne[3] == 1);
const int64_t nb00 = src0->nb[0];
const int64_t nb01 = src0->nb[1];
const int64_t nb02 = src0->nb[2];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
GGML_ASSERT(src1->ne[3] == 1);
const int64_t nb10 = src1->nb[0];
const int64_t nb11 = src1->nb[1];
const int64_t nb12 = src1->nb[2];
SYCL_CHECK(ggml_sycl_set_device(g_main_device)); SYCL_CHECK(ggml_sycl_set_device(g_main_device));
dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
@ -13917,21 +13923,21 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
char * src1_ddc = (char *) src1_extra->data_device[g_main_device_index]; char * src1_ddc = (char *) src1_extra->data_device[g_main_device_index];
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) { } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) {
ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else { } else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type)); ggml_type_name(src0->type), ggml_type_name(src1->type));
@ -14493,6 +14499,37 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
return true; return true;
} }
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
int max_compute_units = -1;
for(int i=0;i<max_len;i++) id_list[i] = 0;
int device_count = dpct::dev_mgr::instance().device_count();
for(int id=0; id< device_count; id++){
sycl::device device = dpct::dev_mgr::instance().get_device(id);
if (!device.is_gpu()) continue;
dpct::device_info prop;
dpct::get_device_info(prop, device);
if(max_compute_units < prop.get_max_compute_units()) max_compute_units = prop.get_max_compute_units();
}
for(int id=0;id< device_count;id++){
sycl::device device = dpct::dev_mgr::instance().get_device(id);
if (!device.is_gpu()) continue;
dpct::device_info prop;
dpct::get_device_info(prop, device);
if(max_compute_units == prop.get_max_compute_units() && prop.get_major_version() == 1 ){
id_list[id] = 1;
}
}
return;
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
<< ", line:" << __LINE__ << std::endl;
std::exit(1);
}
int ggml_sycl_get_device_count() try { int ggml_sycl_get_device_count() try {
int device_count; int device_count;
if (CHECK_TRY_ERROR(device_count = if (CHECK_TRY_ERROR(device_count =
@ -14507,7 +14544,7 @@ catch (sycl::exception const &exc) {
std::exit(1); std::exit(1);
} }
void ggml_sycl_get_device_description(int device, char *description, GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
size_t description_size) try { size_t description_size) try {
dpct::device_info prop; dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
@ -14758,6 +14795,12 @@ static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_ty
UNUSED(buft); UNUSED(buft);
} }
static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
return dpct::get_current_device().get_max_mem_alloc_size();
UNUSED(buft);
}
static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
int64_t row_low = 0; int64_t row_low = 0;
int64_t row_high = ggml_nrows(tensor); int64_t row_high = ggml_nrows(tensor);
@ -14788,7 +14831,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
/* .get_name = */ ggml_backend_sycl_buffer_type_name, /* .get_name = */ ggml_backend_sycl_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
/* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size, /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend, /* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
/* .is_host = */ nullptr, /* .is_host = */ nullptr,

View File

@ -22,7 +22,8 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device); GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void); GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -8,24 +8,29 @@ extern "C" {
#endif #endif
#define GGML_VK_NAME "Vulkan" #define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16
GGML_API void ggml_vk_init(void); GGML_API void ggml_vk_init_cpu_assist(void);
GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node); GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
GGML_API void ggml_vk_preallocate_buffers(void); GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node); GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#ifdef GGML_VULKAN_CHECK_RESULTS #ifdef GGML_VULKAN_CHECK_RESULTS
void ggml_vk_check_results_1(struct ggml_compute_params * params, struct ggml_tensor * tensor); void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#endif #endif
GGML_API void ggml_vk_graph_cleanup(void); GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
GGML_API void ggml_vk_free_cpu_assist(void);
// backend API // backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(void); GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend); GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(void); GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);

223
ggml.c
View File

@ -428,8 +428,8 @@ int64_t ggml_cycles_per_ms(void) {
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
[GGML_TYPE_I8] = { [GGML_TYPE_I8] = {
@ -457,6 +457,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.is_quantized = false, .is_quantized = false,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32, .vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
}, },
[GGML_TYPE_F16] = { [GGML_TYPE_F16] = {
.type_name = "f16", .type_name = "f16",
@ -468,6 +469,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
.vec_dot_type = GGML_TYPE_F16, .vec_dot_type = GGML_TYPE_F16,
.nrows = 1,
}, },
[GGML_TYPE_Q4_0] = { [GGML_TYPE_Q4_0] = {
.type_name = "q4_0", .type_name = "q4_0",
@ -479,6 +481,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
.vec_dot = ggml_vec_dot_q4_0_q8_0, .vec_dot = ggml_vec_dot_q4_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0, .vec_dot_type = GGML_TYPE_Q8_0,
#if defined (__ARM_FEATURE_MATMUL_INT8)
.nrows = 2,
#else
.nrows = 1,
#endif
}, },
[GGML_TYPE_Q4_1] = { [GGML_TYPE_Q4_1] = {
.type_name = "q4_1", .type_name = "q4_1",
@ -490,6 +497,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
.vec_dot = ggml_vec_dot_q4_1_q8_1, .vec_dot = ggml_vec_dot_q4_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1, .vec_dot_type = GGML_TYPE_Q8_1,
#if defined (__ARM_FEATURE_MATMUL_INT8)
.nrows = 2,
#else
.nrows = 1,
#endif
}, },
[4] = { // GGML_TYPE_Q4_2 [4] = { // GGML_TYPE_Q4_2
.type_name = "DEPRECATED", .type_name = "DEPRECATED",
@ -501,6 +513,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = NULL, .from_float_reference = NULL,
.vec_dot = NULL, .vec_dot = NULL,
.vec_dot_type = GGML_TYPE_COUNT, .vec_dot_type = GGML_TYPE_COUNT,
.nrows = 1,
}, },
[5] = { // GGML_TYPE_Q4_3 [5] = { // GGML_TYPE_Q4_3
.type_name = "DEPRECATED", .type_name = "DEPRECATED",
@ -512,6 +525,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = NULL, .from_float_reference = NULL,
.vec_dot = NULL, .vec_dot = NULL,
.vec_dot_type = GGML_TYPE_COUNT, .vec_dot_type = GGML_TYPE_COUNT,
.nrows = 1,
}, },
[GGML_TYPE_Q5_0] = { [GGML_TYPE_Q5_0] = {
.type_name = "q5_0", .type_name = "q5_0",
@ -523,6 +537,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
.vec_dot = ggml_vec_dot_q5_0_q8_0, .vec_dot = ggml_vec_dot_q5_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0, .vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
}, },
[GGML_TYPE_Q5_1] = { [GGML_TYPE_Q5_1] = {
.type_name = "q5_1", .type_name = "q5_1",
@ -534,6 +549,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
.vec_dot = ggml_vec_dot_q5_1_q8_1, .vec_dot = ggml_vec_dot_q5_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1, .vec_dot_type = GGML_TYPE_Q8_1,
.nrows = 1,
}, },
[GGML_TYPE_Q8_0] = { [GGML_TYPE_Q8_0] = {
.type_name = "q8_0", .type_name = "q8_0",
@ -545,6 +561,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
.vec_dot = ggml_vec_dot_q8_0_q8_0, .vec_dot = ggml_vec_dot_q8_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0, .vec_dot_type = GGML_TYPE_Q8_0,
#if defined (__ARM_FEATURE_MATMUL_INT8)
.nrows = 2,
#else
.nrows = 1,
#endif
}, },
[GGML_TYPE_Q8_1] = { [GGML_TYPE_Q8_1] = {
.type_name = "q8_1", .type_name = "q8_1",
@ -554,6 +575,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float = quantize_row_q8_1, .from_float = quantize_row_q8_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
.vec_dot_type = GGML_TYPE_Q8_1, .vec_dot_type = GGML_TYPE_Q8_1,
.nrows = 1,
}, },
[GGML_TYPE_Q2_K] = { [GGML_TYPE_Q2_K] = {
.type_name = "q2_K", .type_name = "q2_K",
@ -565,6 +587,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
.vec_dot = ggml_vec_dot_q2_K_q8_K, .vec_dot = ggml_vec_dot_q2_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
}, },
[GGML_TYPE_Q3_K] = { [GGML_TYPE_Q3_K] = {
.type_name = "q3_K", .type_name = "q3_K",
@ -576,6 +599,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
.vec_dot = ggml_vec_dot_q3_K_q8_K, .vec_dot = ggml_vec_dot_q3_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
}, },
[GGML_TYPE_Q4_K] = { [GGML_TYPE_Q4_K] = {
.type_name = "q4_K", .type_name = "q4_K",
@ -587,6 +611,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
.vec_dot = ggml_vec_dot_q4_K_q8_K, .vec_dot = ggml_vec_dot_q4_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
}, },
[GGML_TYPE_Q5_K] = { [GGML_TYPE_Q5_K] = {
.type_name = "q5_K", .type_name = "q5_K",
@ -598,6 +623,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
.vec_dot = ggml_vec_dot_q5_K_q8_K, .vec_dot = ggml_vec_dot_q5_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
}, },
[GGML_TYPE_Q6_K] = { [GGML_TYPE_Q6_K] = {
.type_name = "q6_K", .type_name = "q6_K",
@ -609,6 +635,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
.vec_dot = ggml_vec_dot_q6_K_q8_K, .vec_dot = ggml_vec_dot_q6_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
}, },
[GGML_TYPE_IQ2_XXS] = { [GGML_TYPE_IQ2_XXS] = {
.type_name = "iq2_xxs", .type_name = "iq2_xxs",
@ -620,6 +647,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = NULL, .from_float_reference = NULL,
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K, .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
}, },
[GGML_TYPE_IQ2_XS] = { [GGML_TYPE_IQ2_XS] = {
.type_name = "iq2_xs", .type_name = "iq2_xs",
@ -631,6 +659,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = NULL, .from_float_reference = NULL,
.vec_dot = ggml_vec_dot_iq2_xs_q8_K, .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
}, },
[GGML_TYPE_IQ3_XXS] = { [GGML_TYPE_IQ3_XXS] = {
.type_name = "iq3_xxs", .type_name = "iq3_xxs",
@ -642,6 +671,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference, .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K, .vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
}, },
[GGML_TYPE_Q8_K] = { [GGML_TYPE_Q8_K] = {
.type_name = "q8_K", .type_name = "q8_K",
@ -1212,7 +1242,13 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
#ifdef GGML_SIMD #ifdef GGML_SIMD
float sumf = 0.0f; float sumf = 0.0f;
const int np = (n & ~(GGML_F32_STEP - 1)); const int np = (n & ~(GGML_F32_STEP - 1));
@ -1249,7 +1285,13 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
*s = sumf; *s = sumf;
} }
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
ggml_float sumf = 0.0; ggml_float sumf = 0.0;
#if defined(GGML_SIMD) #if defined(GGML_SIMD)
@ -1515,7 +1557,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
#endif #endif
} }
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); } inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
@ -2405,7 +2447,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
#elif defined(GGML_USE_CLBLAST) #elif defined(GGML_USE_CLBLAST)
ggml_cl_init(); ggml_cl_init();
#elif defined(GGML_USE_VULKAN) #elif defined(GGML_USE_VULKAN)
ggml_vk_init(); ggml_vk_init_cpu_assist();
#elif defined(GGML_USE_SYCL) #elif defined(GGML_USE_SYCL)
ggml_init_sycl(); ggml_init_sycl();
#endif #endif
@ -2532,7 +2574,8 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
size_t max_size = 0; size_t max_size = 0;
for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) { for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
max_size = MAX(max_size, ggml_nbytes(tensor)); size_t bytes = ggml_nbytes(tensor);
max_size = MAX(max_size, bytes);
} }
return max_size; return max_size;
@ -2668,7 +2711,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
/*.nb =*/ { 0, 0, 0, 0 }, /*.nb =*/ { 0, 0, 0, 0 },
/*.op =*/ GGML_OP_NONE, /*.op =*/ GGML_OP_NONE,
/*.op_params =*/ { 0 }, /*.op_params =*/ { 0 },
/*.is_param =*/ false, /*.flags =*/ 0,
/*.grad =*/ NULL, /*.grad =*/ NULL,
/*.src =*/ { NULL }, /*.src =*/ { NULL },
/*.perf_runs =*/ 0, /*.perf_runs =*/ 0,
@ -6626,7 +6669,7 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
void ggml_set_param( void ggml_set_param(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * tensor) { struct ggml_tensor * tensor) {
tensor->is_param = true; tensor->flags |= GGML_TENSOR_FLAG_PARAM;
GGML_ASSERT(tensor->grad == NULL); GGML_ASSERT(tensor->grad == NULL);
tensor->grad = ggml_dup_tensor(ctx, tensor); tensor->grad = ggml_dup_tensor(ctx, tensor);
@ -10109,6 +10152,7 @@ static void ggml_compute_forward_mul_mat(
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
int64_t const vec_dot_num_rows = type_traits[type].nrows;
GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11); GGML_ASSERT(ne1 == ne11);
@ -10276,12 +10320,23 @@ static void ggml_compute_forward_mul_mat(
const int64_t blck_0 = 16; const int64_t blck_0 = 16;
const int64_t blck_1 = 16; const int64_t blck_1 = 16;
// dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
int64_t nrc = vec_dot_num_rows;
// TODO: currently the mmla kernels support only even numbered rows/cols.
// this check can be removed once they are extended to support odd numbered rows/cols too
if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
nrc = 1;
}
const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
// attempt to reduce false-sharing (does not seem to make a difference) // attempt to reduce false-sharing (does not seem to make a difference)
float tmp[16]; // 16 * 2, accounting for mmla kernels
float tmp[32];
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
const int64_t i13 = (ir1/(ne12*ne1)); const int64_t i13 = (ir1/(ne12*ne1));
const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1; const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1); const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
@ -10304,17 +10359,19 @@ static void ggml_compute_forward_mul_mat(
(src1_cont || src1->type != vec_dot_type (src1_cont || src1->type != vec_dot_type
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
: (i11*nb11 + i12*nb12 + i13*nb13)); : (i11*nb11 + i12*nb12 + i13*nb13));
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
//} //}
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
}
for (int cn = 0; cn < nrc; ++cn) {
memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
} }
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
} }
} }
} }
@ -10503,7 +10560,7 @@ static void ggml_compute_forward_mul_mat_id(
//} //}
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
} }
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
} }
@ -11687,7 +11744,7 @@ static void ggml_compute_forward_soft_max_back_f32(
// linear runtime, no additional memory // linear runtime, no additional memory
float dot_y_dy = 0; float dot_y_dy = 0;
ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy); ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
ggml_vec_cpy_f32 (nc, dx, dy); ggml_vec_cpy_f32 (nc, dx, dy);
ggml_vec_acc1_f32(nc, dx, -dot_y_dy); ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
ggml_vec_mul_f32 (nc, dx, dx, y); ggml_vec_mul_f32 (nc, dx, dx, y);
@ -12007,8 +12064,10 @@ GGML_CALL void ggml_rope_yarn_corr_dims(
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
) { ) {
// start and end correction dims // start and end correction dims
dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base))); float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base));
dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base))); float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base));
dims[0] = MAX(0, start);
dims[1] = MIN(n_dims - 1, end);
} }
static void ggml_compute_forward_rope_f32( static void ggml_compute_forward_rope_f32(
@ -12486,9 +12545,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
const int i1n = i10*ne11; const int i1n = i10*ne11;
for (int i00 = 0; i00 < ne00; i00++) { for (int i00 = 0; i00 < ne00; i00++) {
float v = 0; float v = 0;
ggml_vec_dot_f16(ne02, &v, ggml_vec_dot_f16(ne02, &v, 0,
(ggml_fp16_t *) wdata_src + i1n, (ggml_fp16_t *) wdata_src + i1n, 0,
(ggml_fp16_t *) wdata_kernel + i00*ne02); (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
dst_data[i10*s0 + i00] += v; dst_data[i10*s0 + i00] += v;
} }
} }
@ -12583,9 +12642,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
const int i1n = i10*ne11; const int i1n = i10*ne11;
for (int i00 = 0; i00 < ne00; i00++) { for (int i00 = 0; i00 < ne00; i00++) {
float v = 0; float v = 0;
ggml_vec_dot_f32(ne02, &v, ggml_vec_dot_f32(ne02, &v, 0,
wdata_src + i1n, wdata_src + i1n, 0,
wdata_kernel + i00*ne02); wdata_kernel + i00*ne02, 0, 1);
dst_data[i10*s0 + i00] += v; dst_data[i10*s0 + i00] += v;
} }
} }
@ -12900,9 +12959,9 @@ static void ggml_compute_forward_conv_transpose_2d(
for (int i01 = 0; i01 < ne01; i01++) { for (int i01 = 0; i01 < ne01; i01++) {
for (int i00 = 0; i00 < ne00; i00++) { for (int i00 = 0; i00 < ne00; i00++) {
float v = 0; float v = 0;
ggml_vec_dot_f16(ne03, &v, ggml_vec_dot_f16(ne03, &v, 0,
wdata_src + i1n, wdata_src + i1n, 0,
wdata_kernel + i01*ne00*ne03 + i00*ne03); wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
} }
} }
@ -13331,9 +13390,9 @@ static void ggml_compute_forward_flash_attn_f32(
const int i1 = ik1; const int i1 = ik1;
ggml_vec_dot_f32(neq0, ggml_vec_dot_f32(neq0,
S + i1, S + i1, 0,
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
} }
// scale // scale
@ -13416,9 +13475,9 @@ static void ggml_compute_forward_flash_attn_f32(
const int iv3 = iq3; const int iv3 = iq3;
ggml_vec_dot_f32(masked_begin, ggml_vec_dot_f32(masked_begin,
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
(float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
S); S, 0, 1);
} }
} }
} }
@ -13521,9 +13580,9 @@ static void ggml_compute_forward_flash_attn_f16(
const int i1 = ik1; const int i1 = ik1;
ggml_vec_dot_f16(neq0, ggml_vec_dot_f16(neq0,
S + i1, S + i1, 0,
(ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
} }
} else { } else {
for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) { for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
@ -13625,9 +13684,9 @@ static void ggml_compute_forward_flash_attn_f16(
const int iv3 = iq3; const int iv3 = iq3;
ggml_vec_dot_f16(nev0, ggml_vec_dot_f16(nev0,
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
(ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
S16); S16, 0, 1);
} }
} else { } else {
for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) { for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
@ -13796,9 +13855,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
} }
ggml_vec_dot_f16(D, ggml_vec_dot_f16(D,
&s, &s, 0,
(ggml_fp16_t *) ((char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3)), (ggml_fp16_t *) ((char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
Q16); Q16, 0, 1);
s = s*scale + mv; s = s*scale + mv;
@ -13960,9 +14019,9 @@ static void ggml_compute_forward_flash_ff_f16(
const int i1 = ib01; const int i1 = ib01;
ggml_vec_dot_f16(nea0, ggml_vec_dot_f16(nea0,
S + i1, S + i1, 0,
(ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
(ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3))); (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1);
} }
ggml_vec_add_f32(neb01, S, S, (float *) b1->data); ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
@ -13985,9 +14044,9 @@ static void ggml_compute_forward_flash_ff_f16(
for (int64_t ic = 0; ic < nec01; ++ic) { for (int64_t ic = 0; ic < nec01; ++ic) {
ggml_vec_dot_f16(neb01, ggml_vec_dot_f16(neb01,
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
(ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
S16); S16, 0, 1);
} }
ggml_vec_add_f32(nec01, ggml_vec_add_f32(nec01,
@ -14174,9 +14233,9 @@ static void ggml_compute_forward_flash_attn_back_f32(
const int i1 = ik1; const int i1 = ik1;
ggml_vec_dot_f32(neq0, ggml_vec_dot_f32(neq0,
S + i1, S + i1, 0,
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
} }
// scale // scale
@ -14321,7 +14380,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
// S = SM * (S - dot(SM, S)) // S = SM * (S - dot(SM, S))
float dot_SM_gradSM = 0; float dot_SM_gradSM = 0;
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S); ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
ggml_vec_mul_f32 (masked_begin, S, S, SM); ggml_vec_mul_f32 (masked_begin, S, S, SM);
@ -15158,10 +15217,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
#elif defined(GGML_USE_VULKAN) #elif defined(GGML_USE_VULKAN)
const bool skip_cpu = ggml_vk_compute_forward(params, tensor); const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
#ifdef GGML_VULKAN_CHECK_RESULTS #ifdef GGML_VULKAN_CHECK_RESULTS
if (skip_cpu) { if (skip_cpu) {
ggml_vk_check_results_1(params, tensor); ggml_vk_check_results_1_cpu_assist(params, tensor);
} }
#endif #endif
if (skip_cpu) { if (skip_cpu) {
@ -15623,7 +15682,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
return NULL; return NULL;
} }
if (node->is_param) { if (node->flags & GGML_TENSOR_FLAG_PARAM) {
return node; return node;
} }
@ -15657,7 +15716,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
clone->op = node->op; clone->op = node->op;
clone->grad = node->grad; clone->grad = node->grad;
clone->is_param = node->is_param; clone->flags = node->flags;
clone->extra = node->extra; clone->extra = node->extra;
for (int k = 0; k < GGML_MAX_DIMS; ++k) { for (int k = 0; k < GGML_MAX_DIMS; ++k) {
clone->nb[k] = node->nb[k]; clone->nb[k] = node->nb[k];
@ -16690,7 +16749,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
for (int i = 0; i < gf->n_nodes; i++) { for (int i = 0; i < gf->n_nodes; i++) {
struct ggml_tensor * node = gf->nodes[i]; struct ggml_tensor * node = gf->nodes[i];
if (node->is_param) { if (node->flags & GGML_TENSOR_FLAG_PARAM) {
GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
ggml_build_forward_expand(gb, node->grad); ggml_build_forward_expand(gb, node->grad);
} }
@ -16962,7 +17021,7 @@ struct ggml_compute_state_shared {
atomic_int node_n; // active graph node atomic_int node_n; // active graph node
atomic_int node_task; // active graph node task phase atomic_int node_task; // active graph node task phase
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
void * abort_callback_data; void * abort_callback_data;
}; };
@ -17589,12 +17648,12 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
#ifdef GGML_USE_VULKAN #ifdef GGML_USE_VULKAN
for (int i = 0; i < cgraph->n_nodes; i++) { for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]); ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
} }
ggml_vk_preallocate_buffers(); ggml_vk_preallocate_buffers_cpu_assist();
for (int i = 0; i < cgraph->n_nodes; i++) { for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1); ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
} }
#endif #endif
@ -17650,7 +17709,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
} }
#ifdef GGML_USE_VULKAN #ifdef GGML_USE_VULKAN
ggml_vk_graph_cleanup(); ggml_vk_graph_cleanup_cpu_assist();
#endif #endif
// performance stats (graph) // performance stats (graph)
@ -18182,7 +18241,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
i, i,
node->ne[0], node->ne[1], node->ne[2], node->ne[0], node->ne[1], node->ne[2],
ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs,
(double) node->perf_cycles / (double) ggml_cycles_per_ms(), (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
(double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs, (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
(double) node->perf_time_us / 1000.0, (double) node->perf_time_us / 1000.0,
@ -18275,7 +18334,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
continue; continue;
} }
if (node->is_param) { if (node->flags & GGML_TENSOR_FLAG_PARAM) {
snprintf(color, sizeof(color), "yellow"); snprintf(color, sizeof(color), "yellow");
} else if (node->grad) { } else if (node->grad) {
if (ggml_graph_find(gf, node)) { if (ggml_graph_find(gf, node)) {
@ -18449,7 +18508,7 @@ static enum ggml_opt_result ggml_opt_adam(
int np = 0; int np = 0;
int64_t nx = 0; int64_t nx = 0;
for (int i = 0; i < gf->n_nodes; ++i) { for (int i = 0; i < gf->n_nodes; ++i) {
if (gf->nodes[i]->is_param) { if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
GGML_ASSERT(np < GGML_MAX_PARAMS); GGML_ASSERT(np < GGML_MAX_PARAMS);
@ -18702,7 +18761,7 @@ static enum ggml_opt_result linesearch_backtracking(
} }
// compute the initial gradient in the search direction // compute the initial gradient in the search direction
ggml_vec_dot_f32(nx, &dginit, g, d); ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1);
// make sure that d points to a descent direction // make sure that d points to a descent direction
if (0 < dginit) { if (0 < dginit) {
@ -18752,7 +18811,7 @@ static enum ggml_opt_result linesearch_backtracking(
return count; return count;
} }
ggml_vec_dot_f32(nx, &dg, g, d); ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1);
// check the Wolfe condition // check the Wolfe condition
if (dg < params->lbfgs.wolfe * dginit) { if (dg < params->lbfgs.wolfe * dginit) {
@ -18812,7 +18871,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
int np = 0; int np = 0;
int nx = 0; int nx = 0;
for (int i = 0; i < gf->n_nodes; ++i) { for (int i = 0; i < gf->n_nodes; ++i) {
if (gf->nodes[i]->is_param) { if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
GGML_ASSERT(np < GGML_MAX_PARAMS); GGML_ASSERT(np < GGML_MAX_PARAMS);
@ -19013,8 +19072,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
// ys = y^t \cdot s -> 1 / \rho. // ys = y^t \cdot s -> 1 / \rho.
// yy = y^t \cdot y. // yy = y^t \cdot y.
// //
ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]); ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1);
ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1);
lm_ys[end[0]] = ys; lm_ys[end[0]] = ys;
@ -19033,7 +19092,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
for (int i = 0; i < bound; ++i) { for (int i = 0; i < bound; ++i) {
j[0] = (j[0] + m - 1) % m; j[0] = (j[0] + m - 1) % m;
// \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1} // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d); ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1);
lm_alpha[j[0]] /= lm_ys[j[0]]; lm_alpha[j[0]] /= lm_ys[j[0]];
// q_{i} = q_{i+1} - \alpha_{i} y_{i} // q_{i} = q_{i+1} - \alpha_{i} y_{i}
ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]); ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
@ -19043,7 +19102,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
for (int i = 0; i < bound; ++i) { for (int i = 0; i < bound; ++i) {
// \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i} // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d); ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1);
beta /= lm_ys[j[0]]; beta /= lm_ys[j[0]];
// \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta); ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
@ -19287,6 +19346,16 @@ enum ggml_opt_result ggml_opt_resume_g(
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void ggml_set_input(struct ggml_tensor * tensor) {
tensor->flags |= GGML_TENSOR_FLAG_INPUT;
}
void ggml_set_output(struct ggml_tensor * tensor) {
tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
}
////////////////////////////////////////////////////////////////////////////////
void ggml_quantize_init(enum ggml_type type) { void ggml_quantize_init(enum ggml_type type) {
ggml_critical_section_start(); ggml_critical_section_start();
@ -20931,4 +21000,12 @@ int ggml_cpu_has_vsx(void) {
#endif #endif
} }
int ggml_cpu_has_matmul_int8(void) {
#if defined(__ARM_FEATURE_MATMUL_INT8)
return 1;
#else
return 0;
#endif
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////

26
ggml.h
View File

@ -511,6 +511,12 @@ extern "C" {
GGML_LOG_LEVEL_DEBUG = 5 GGML_LOG_LEVEL_DEBUG = 5
}; };
enum ggml_tensor_flag {
GGML_TENSOR_FLAG_INPUT = 1,
GGML_TENSOR_FLAG_OUTPUT = 2,
GGML_TENSOR_FLAG_PARAM = 4,
};
// ggml object // ggml object
struct ggml_object { struct ggml_object {
size_t offs; size_t offs;
@ -544,7 +550,7 @@ extern "C" {
// op params - allocated as int32_t for alignment // op params - allocated as int32_t for alignment
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
bool is_param; int32_t flags;
struct ggml_tensor * grad; struct ggml_tensor * grad;
struct ggml_tensor * src[GGML_MAX_SRC]; struct ggml_tensor * src[GGML_MAX_SRC];
@ -568,6 +574,11 @@ extern "C" {
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
// Abort callback
// If not NULL, called before ggml computation
// If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data);
// the compute plan that needs to be prepared for ggml_graph_compute() // the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287 // since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan { struct ggml_cplan {
@ -577,7 +588,7 @@ extern "C" {
int n_threads; int n_threads;
// abort ggml_graph_compute when true // abort ggml_graph_compute when true
bool (*abort_callback)(void * data); ggml_abort_callback abort_callback;
void * abort_callback_data; void * abort_callback_data;
}; };
@ -2107,6 +2118,12 @@ extern "C" {
ggml_opt_callback callback, ggml_opt_callback callback,
void * callback_data); void * callback_data);
//
// tensor flags
//
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
// //
// quantization // quantization
// //
@ -2293,6 +2310,7 @@ extern "C" {
GGML_API int ggml_cpu_has_ssse3 (void); GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_sycl (void); GGML_API int ggml_cpu_has_sycl (void);
GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_vsx (void);
GGML_API int ggml_cpu_has_matmul_int8(void);
// //
// Internal types and functions exposed for tests and benchmarks // Internal types and functions exposed for tests and benchmarks
@ -2306,7 +2324,8 @@ extern "C" {
#endif #endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef struct { typedef struct {
const char * type_name; const char * type_name;
@ -2318,6 +2337,7 @@ extern "C" {
ggml_from_float_t from_float_reference; ggml_from_float_t from_float_reference;
ggml_vec_dot_t vec_dot; ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type; enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously;
} ggml_type_traits_t; } ggml_type_traits_t;
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

View File

@ -157,19 +157,10 @@ struct block_q6_K
# Dequant functions # Dequant functions
shader_f16_dequant_func = """ shader_f16_dequant_func = """
#define DEQUANT_FUNC f16vec2 v = f16vec2(data_a[ib + 0], data_a[ib + 1]);
"""
shader_f16_dequant_func_compat = """
#define DEQUANT_FUNC vec2 v = vec2(data_a[ib + 0], data_a[ib + 1]); #define DEQUANT_FUNC vec2 v = vec2(data_a[ib + 0], data_a[ib + 1]);
""" """
shader_q4_0_dequant_func = """ shader_q4_0_dequant_func = """
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
const uint8_t vui = data_a[ib].qs[iqs]; \
f16vec2 v = f16vec2(vui & 0xF, vui >> 4); \
v = (v - 8.0hf)*d;
"""
shader_q4_0_dequant_func_compat = """
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \ #define DEQUANT_FUNC const float d = float(data_a[ib].d); \
const uint vui = uint(data_a[ib].qs[iqs]); \ const uint vui = uint(data_a[ib].qs[iqs]); \
vec2 v = vec2(vui & 0xF, vui >> 4); \ vec2 v = vec2(vui & 0xF, vui >> 4); \
@ -177,13 +168,6 @@ v = (v - 8.0f)*d;
""" """
shader_q4_1_dequant_func = """ shader_q4_1_dequant_func = """
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
const float16_t m = data_a[ib].m; \
const uint8_t vui = data_a[ib].qs[iqs]; \
f16vec2 v = f16vec2(vui & 0xF, vui >> 4); \
v = v*d + m;
"""
shader_q4_1_dequant_func_compat = """
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \ #define DEQUANT_FUNC const float d = float(data_a[ib].d); \
const float m = float(data_a[ib].m); \ const float m = float(data_a[ib].m); \
const uint vui = uint(data_a[ib].qs[iqs]); \ const uint vui = uint(data_a[ib].qs[iqs]); \
@ -192,14 +176,6 @@ v = v*d + m;
""" """
shader_q5_0_dequant_func = """ shader_q5_0_dequant_func = """
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; \
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); \
const uint8_t vui = data_a[ib].qs[iqs]; \
f16vec2 v = f16vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y); \
v = (v - 16.0hf) * d;
"""
shader_q5_0_dequant_func_compat = """
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \ #define DEQUANT_FUNC const float d = float(data_a[ib].d); \
const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; \ const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; \
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); \ const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); \
@ -209,14 +185,6 @@ v = (v - 16.0f) * d;
""" """
shader_q5_1_dequant_func = """ shader_q5_1_dequant_func = """
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
const float16_t m = data_a[ib].m; \
const ivec2 qh = ivec2(((data_a[ib].qh >> iqs) << 4) & 0x10, (data_a[ib].qh >> (iqs + 12)) & 0x10); \
const uint8_t vui = data_a[ib].qs[iqs]; \
f16vec2 v = f16vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y); \
v = v*d + m;
"""
shader_q5_1_dequant_func_compat = """
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \ #define DEQUANT_FUNC const float d = float(data_a[ib].d); \
const float m = float(data_a[ib].m); \ const float m = float(data_a[ib].m); \
const ivec2 qh = ivec2(((data_a[ib].qh >> iqs) << 4) & 0x10, (data_a[ib].qh >> (iqs + 12)) & 0x10); \ const ivec2 qh = ivec2(((data_a[ib].qh >> iqs) << 4) & 0x10, (data_a[ib].qh >> (iqs + 12)) & 0x10); \
@ -226,11 +194,6 @@ v = v*d + m;
""" """
shader_q8_0_dequant_func = """ shader_q8_0_dequant_func = """
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
f16vec2 v = f16vec2(data_a[ib].qs[iqs], data_a[ib].qs[iqs + 1]); \
v = v * d;
"""
shader_q8_0_dequant_func_compat = """
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \ #define DEQUANT_FUNC const float d = float(data_a[ib].d); \
vec2 v = vec2(int(data_a[ib].qs[iqs]), int(data_a[ib].qs[iqs + 1])); \ vec2 v = vec2(int(data_a[ib].qs[iqs]), int(data_a[ib].qs[iqs + 1])); \
v = v * d; v = v * d;
@ -1689,7 +1652,8 @@ void main() {
} }
const float xi = float(data_a[i]); const float xi = float(data_a[i]);
data_d[i] = D_TYPE(0.5f*xi*(1.0f + tanh(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)))); const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi);
data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1)));
} }
""" """
@ -2103,13 +2067,15 @@ type_names = {
K_QUANTS_PER_ITERATION = 2 K_QUANTS_PER_ITERATION = 2
ASYNCIO_CONCURRENCY = 64
output_dir = gettempdir() output_dir = gettempdir()
lock = asyncio.Lock() lock = asyncio.Lock()
shader_fnames = [] shader_fnames = []
async def string_to_spv(name, code, defines, fp16): async def string_to_spv(name, code, defines, fp16=True):
f = NamedTemporaryFile(mode="w", delete=False) f = NamedTemporaryFile(mode="w", delete=False)
f.write(code) f.write(code)
f.flush() f.flush()
@ -2199,64 +2165,6 @@ async def main():
tasks.append(string_to_spv("matmul_f16_f32_aligned_m", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) tasks.append(string_to_spv("matmul_f16_f32_aligned_m", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv("matmul_f16_f32_aligned_s", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) tasks.append(string_to_spv("matmul_f16_f32_aligned_s", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
# Build dequant shaders
tasks.append(string_to_spv("f32_to_f16", f32_to_f16_src, {}, fp16))
for i in range(0, VK_NUM_TYPES):
stream.clear()
stream.extend((dequant_head, shader_int8_ext, shader_float_type))
if i == GGML_TYPE_F16:
stream.extend((shader_f16_defines, shader_f16_dequant_func_compat if not fp16 else shader_f16_dequant_func, dequant_body))
elif i == GGML_TYPE_Q4_0:
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func_compat if not fp16 else shader_q4_0_dequant_func, dequant_body))
elif i == GGML_TYPE_Q4_1:
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func_compat if not fp16 else shader_q4_1_dequant_func, dequant_body))
elif i == GGML_TYPE_Q5_0:
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func_compat if not fp16 else shader_q5_0_dequant_func, dequant_body))
elif i == GGML_TYPE_Q5_1:
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat if not fp16 else shader_q5_1_dequant_func, dequant_body))
elif i == GGML_TYPE_Q8_0:
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat if not fp16 else shader_q8_0_dequant_func, dequant_body))
elif i == GGML_TYPE_Q2_K:
stream.extend((shader_q2_K_defines, dequant_q2_K_body))
elif i == GGML_TYPE_Q3_K:
stream.extend((shader_q3_K_defines, dequant_q3_K_body))
elif i == GGML_TYPE_Q4_K:
stream.extend((shader_q4_K_defines, dequant_q4_K_body))
elif i == GGML_TYPE_Q5_K:
stream.extend((shader_q5_K_defines, dequant_q5_K_body))
elif i == GGML_TYPE_Q6_K:
stream.extend((shader_q6_K_defines, dequant_q6_K_body))
else:
continue
tasks.append(string_to_spv(f"dequant_{type_names[i]}", "".join(stream), {"D_TYPE": "float16_t"}, fp16))
# get_rows
for i in range(0, VK_NUM_TYPES):
stream.clear()
stream.extend((generic_head, shader_int8_ext, shader_float_type))
if i == GGML_TYPE_F16:
stream.extend((shader_f16_defines, shader_f16_dequant_func_compat if not fp16 else shader_f16_dequant_func, get_rows_body))
elif i == GGML_TYPE_Q4_0:
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func_compat if not fp16 else shader_q4_0_dequant_func, get_rows_body))
elif i == GGML_TYPE_Q4_1:
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func_compat if not fp16 else shader_q4_1_dequant_func, get_rows_body))
elif i == GGML_TYPE_Q5_0:
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func_compat if not fp16 else shader_q5_0_dequant_func, get_rows_body))
elif i == GGML_TYPE_Q5_1:
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat if not fp16 else shader_q5_1_dequant_func, get_rows_body))
elif i == GGML_TYPE_Q8_0:
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat if not fp16 else shader_q8_0_dequant_func, get_rows_body))
else:
continue
tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"}, fp16))
tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"}, fp16))
# Shaders where precision is needed, so no fp16 version # Shaders where precision is needed, so no fp16 version
# mul mat vec # mul mat vec
@ -2265,17 +2173,17 @@ async def main():
stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32)) stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32))
if i == GGML_TYPE_F16: if i == GGML_TYPE_F16:
stream.extend((shader_f16_defines, shader_f16_dequant_func_compat, mul_mat_vec_body)) stream.extend((shader_f16_defines, shader_f16_dequant_func, mul_mat_vec_body))
elif i == GGML_TYPE_Q4_0: elif i == GGML_TYPE_Q4_0:
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func_compat, mul_mat_vec_body)) stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, mul_mat_vec_body))
elif i == GGML_TYPE_Q4_1: elif i == GGML_TYPE_Q4_1:
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func_compat, mul_mat_vec_body)) stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, mul_mat_vec_body))
elif i == GGML_TYPE_Q5_0: elif i == GGML_TYPE_Q5_0:
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func_compat, mul_mat_vec_body)) stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, mul_mat_vec_body))
elif i == GGML_TYPE_Q5_1: elif i == GGML_TYPE_Q5_1:
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat, mul_mat_vec_body)) stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, mul_mat_vec_body))
elif i == GGML_TYPE_Q8_0: elif i == GGML_TYPE_Q8_0:
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat, mul_mat_vec_body)) stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, mul_mat_vec_body))
elif i == GGML_TYPE_Q2_K: elif i == GGML_TYPE_Q2_K:
stream.extend((shader_q2_K_defines, mul_mat_vec_q2_K_body)) stream.extend((shader_q2_K_defines, mul_mat_vec_q2_K_body))
elif i == GGML_TYPE_Q3_K: elif i == GGML_TYPE_Q3_K:
@ -2289,45 +2197,110 @@ async def main():
else: else:
continue continue
tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION}, fp16)) tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION}))
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, True)) # Dequant shaders
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, True)) for i in range(0, VK_NUM_TYPES):
stream.clear()
stream.extend((dequant_head, shader_int8_ext, shader_f32))
if i == GGML_TYPE_F16:
stream.extend((shader_f16_defines, shader_f16_dequant_func, dequant_body))
elif i == GGML_TYPE_Q4_0:
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, dequant_body))
elif i == GGML_TYPE_Q4_1:
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, dequant_body))
elif i == GGML_TYPE_Q5_0:
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, dequant_body))
elif i == GGML_TYPE_Q5_1:
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, dequant_body))
elif i == GGML_TYPE_Q8_0:
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, dequant_body))
elif i == GGML_TYPE_Q2_K:
stream.extend((shader_q2_K_defines, dequant_q2_K_body))
elif i == GGML_TYPE_Q3_K:
stream.extend((shader_q3_K_defines, dequant_q3_K_body))
elif i == GGML_TYPE_Q4_K:
stream.extend((shader_q4_K_defines, dequant_q4_K_body))
elif i == GGML_TYPE_Q5_K:
stream.extend((shader_q5_K_defines, dequant_q5_K_body))
elif i == GGML_TYPE_Q6_K:
stream.extend((shader_q6_K_defines, dequant_q6_K_body))
else:
continue
tasks.append(string_to_spv(f"dequant_{type_names[i]}", "".join(stream), {"D_TYPE": "float16_t"}))
tasks.append(string_to_spv("f32_to_f16", f32_to_f16_src, {}))
# get_rows
for i in range(0, VK_NUM_TYPES):
stream.clear()
stream.extend((generic_head, shader_int8_ext, shader_f32))
if i == GGML_TYPE_F16:
stream.extend((shader_f16_defines, shader_f16_dequant_func, get_rows_body))
elif i == GGML_TYPE_Q4_0:
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, get_rows_body))
elif i == GGML_TYPE_Q4_1:
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, get_rows_body))
elif i == GGML_TYPE_Q5_0:
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, get_rows_body))
elif i == GGML_TYPE_Q5_1:
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, get_rows_body))
elif i == GGML_TYPE_Q8_0:
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, get_rows_body))
else:
continue
tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
# Norms # Norms
tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("cpy_f32_f32", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) tasks.append(string_to_spv("cpy_f32_f32", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("cpy_f32_f16", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}, True)) tasks.append(string_to_spv("cpy_f32_f16", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("cpy_f16_f16", f"{cpy_src}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}, True)) tasks.append(string_to_spv("cpy_f16_f16", f"{cpy_src}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("add_f32", f"{generic_head}\n{shader_f32}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, True)) tasks.append(string_to_spv("add_f32", f"{generic_head}\n{shader_f32}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}, True)) tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}))
tasks.append(string_to_spv("mul_f32", f"{generic_head}\n{shader_f32}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, True)) tasks.append(string_to_spv("mul_f32", f"{generic_head}\n{shader_f32}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("scale_f32", f"{generic_head}\n{shader_f32}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) tasks.append(string_to_spv("scale_f32", f"{generic_head}\n{shader_f32}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("sqr_f32", f"{generic_head}\n{shader_f32}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) tasks.append(string_to_spv("sqr_f32", f"{generic_head}\n{shader_f32}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("clamp_f32", f"{generic_head}\n{shader_f32}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) tasks.append(string_to_spv("clamp_f32", f"{generic_head}\n{shader_f32}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("relu_f32", f"{generic_head}\n{shader_f32}\n{relu_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) tasks.append(string_to_spv("relu_f32", f"{generic_head}\n{shader_f32}\n{relu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("diag_mask_inf_f32", f"{diag_mask_inf_head}\n{shader_f32}\n{diag_mask_inf_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) tasks.append(string_to_spv("diag_mask_inf_f32", f"{diag_mask_inf_head}\n{shader_f32}\n{diag_mask_inf_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("soft_max_f32", f"{generic_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, True)) tasks.append(string_to_spv("soft_max_f32", f"{generic_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_f32", rope_src, {"A_TYPE": "float", "D_TYPE": "float"}, True)) tasks.append(string_to_spv("rope_f32", rope_src, {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_f16", rope_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}, True)) tasks.append(string_to_spv("rope_f16", rope_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}, True)) tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}, True)) tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
await asyncio.gather(*tasks) # Helper to decorate tasks with semaphore acquisition.
async def withSemaphore(sem, task):
async with sem:
return await task
# Run tasks concurrently guarded by a concurrency limit.
sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY)
await asyncio.gather(*(withSemaphore(sem, task) for task in tasks))
with open("ggml-vulkan-shaders.hpp", "w") as f: with open("ggml-vulkan-shaders.hpp", "w") as f:
f.write("#include <cstdint>\n\n") f.write("#include <cstdint>\n\n")

View File

@ -50,6 +50,7 @@ class Keys:
VALUE_LENGTH = "{arch}.attention.value_length" VALUE_LENGTH = "{arch}.attention.value_length"
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
CAUSAL = "{arch}.attention.causal"
class Rope: class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count" DIMENSION_COUNT = "{arch}.rope.dimension_count"
@ -63,6 +64,7 @@ class Keys:
MODEL = "tokenizer.ggml.model" MODEL = "tokenizer.ggml.model"
LIST = "tokenizer.ggml.tokens" LIST = "tokenizer.ggml.tokens"
TOKEN_TYPE = "tokenizer.ggml.token_type" TOKEN_TYPE = "tokenizer.ggml.token_type"
TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types
SCORES = "tokenizer.ggml.scores" SCORES = "tokenizer.ggml.scores"
MERGES = "tokenizer.ggml.merges" MERGES = "tokenizer.ggml.merges"
BOS_ID = "tokenizer.ggml.bos_token_id" BOS_ID = "tokenizer.ggml.bos_token_id"
@ -104,6 +106,7 @@ class MODEL_ARCH(IntEnum):
CODESHELL = auto() CODESHELL = auto()
ORION = auto() ORION = auto()
INTERNLM2 = auto() INTERNLM2 = auto()
MINICPM = auto()
class MODEL_TENSOR(IntEnum): class MODEL_TENSOR(IntEnum):
@ -121,6 +124,7 @@ class MODEL_TENSOR(IntEnum):
ATTN_OUT = auto() ATTN_OUT = auto()
ATTN_NORM = auto() ATTN_NORM = auto()
ATTN_NORM_2 = auto() ATTN_NORM_2 = auto()
ATTN_OUT_NORM = auto()
ATTN_ROT_EMBD = auto() ATTN_ROT_EMBD = auto()
FFN_GATE_INP = auto() FFN_GATE_INP = auto()
FFN_NORM = auto() FFN_NORM = auto()
@ -133,6 +137,7 @@ class MODEL_TENSOR(IntEnum):
FFN_UP_EXP = auto() FFN_UP_EXP = auto()
ATTN_Q_NORM = auto() ATTN_Q_NORM = auto()
ATTN_K_NORM = auto() ATTN_K_NORM = auto()
LAYER_OUT_NORM = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@ -156,6 +161,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.CODESHELL: "codeshell", MODEL_ARCH.CODESHELL: "codeshell",
MODEL_ARCH.ORION: "orion", MODEL_ARCH.ORION: "orion",
MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.INTERNLM2: "internlm2",
MODEL_ARCH.MINICPM: "minicpm",
} }
TENSOR_NAMES: dict[MODEL_TENSOR, str] = { TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -176,6 +182,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
@ -185,6 +192,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}", MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
} }
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@ -260,17 +268,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
], ],
MODEL_ARCH.BERT: [ MODEL_ARCH.BERT: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.TOKEN_TYPES, MODEL_TENSOR.TOKEN_TYPES,
MODEL_TENSOR.POS_EMBD, MODEL_TENSOR.POS_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_OUT_NORM,
MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.LAYER_OUT_NORM,
], ],
MODEL_ARCH.MPT: [ MODEL_ARCH.MPT: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
@ -464,6 +473,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.MINICPM: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
# TODO # TODO
} }

View File

@ -357,6 +357,9 @@ class GGUFWriter:
def add_layer_norm_rms_eps(self, value: float) -> None: def add_layer_norm_rms_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value) self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)
def add_causal_attention(self, value: bool) -> None:
self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
def add_rope_dimension_count(self, count: int) -> None: def add_rope_dimension_count(self, count: int) -> None:
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
@ -387,6 +390,9 @@ class GGUFWriter:
def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None: def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None:
self.add_array(Keys.Tokenizer.TOKEN_TYPE, types) self.add_array(Keys.Tokenizer.TOKEN_TYPE, types)
def add_token_type_count(self, value: int) -> None:
self.add_uint32(Keys.Tokenizer.TOKEN_TYPE_COUNT, value)
def add_token_scores(self, scores: Sequence[float]) -> None: def add_token_scores(self, scores: Sequence[float]) -> None:
self.add_array(Keys.Tokenizer.SCORES, scores) self.add_array(Keys.Tokenizer.SCORES, scores)

View File

@ -30,6 +30,7 @@ class TensorNameMap:
# Normalization of token embeddings # Normalization of token embeddings
MODEL_TENSOR.TOKEN_EMBD_NORM: ( MODEL_TENSOR.TOKEN_EMBD_NORM: (
"word_embeddings_layernorm", # bloom "word_embeddings_layernorm", # bloom
"embeddings.LayerNorm", # bert
), ),
# Position embeddings # Position embeddings
@ -54,7 +55,6 @@ class TensorNameMap:
"transformer.ln_f", # gpt2 gpt-j falcon "transformer.ln_f", # gpt2 gpt-j falcon
"model.norm", # llama-hf baichuan internlm2 "model.norm", # llama-hf baichuan internlm2
"norm", # llama-pth "norm", # llama-pth
"embeddings.LayerNorm", # bert
"transformer.norm_f", # mpt "transformer.norm_f", # mpt
"ln_f", # refact bloom qwen gpt2 "ln_f", # refact bloom qwen gpt2
"language_model.encoder.final_layernorm", # persimmon "language_model.encoder.final_layernorm", # persimmon
@ -79,7 +79,6 @@ class TensorNameMap:
"transformer.h.{bid}.ln_mlp", # falcon40b "transformer.h.{bid}.ln_mlp", # falcon40b
"model.layers.{bid}.input_layernorm", # llama-hf "model.layers.{bid}.input_layernorm", # llama-hf
"layers.{bid}.attention_norm", # llama-pth "layers.{bid}.attention_norm", # llama-pth
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
"model.layers.{bid}.ln1", # yi "model.layers.{bid}.ln1", # yi
"h.{bid}.ln_1", # gpt2 "h.{bid}.ln_1", # gpt2
@ -155,6 +154,11 @@ class TensorNameMap:
"model.layers.{bid}.attention.wo", # internlm2 "model.layers.{bid}.attention.wo", # internlm2
), ),
# Attention output norm
MODEL_TENSOR.ATTN_OUT_NORM: (
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
),
# Rotary embeddings # Rotary embeddings
MODEL_TENSOR.ATTN_ROT_EMBD: ( MODEL_TENSOR.ATTN_ROT_EMBD: (
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
@ -171,7 +175,6 @@ class TensorNameMap:
"transformer.blocks.{bid}.norm_2", # mpt "transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", # llama-hf "model.layers.{bid}.post_attention_layernorm", # llama-hf
"layers.{bid}.ffn_norm", # llama-pth "layers.{bid}.ffn_norm", # llama-pth
"encoder.layer.{bid}.output.LayerNorm", # bert
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
"model.layers.{bid}.ln2", # yi "model.layers.{bid}.ln2", # yi
"h.{bid}.ln_2", # gpt2 "h.{bid}.ln_2", # gpt2
@ -266,6 +269,10 @@ class TensorNameMap:
MODEL_TENSOR.ROPE_FREQS: ( MODEL_TENSOR.ROPE_FREQS: (
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
), ),
MODEL_TENSOR.LAYER_OUT_NORM: (
"encoder.layer.{bid}.output.LayerNorm", # bert
)
} }
mapping: dict[str, tuple[MODEL_TENSOR, str]] mapping: dict[str, tuple[MODEL_TENSOR, str]]

989
llama.cpp

File diff suppressed because it is too large Load Diff

View File

@ -61,6 +61,7 @@ extern "C" {
enum llama_vocab_type { enum llama_vocab_type {
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
}; };
enum llama_token_type { enum llama_token_type {
@ -213,7 +214,7 @@ extern "C" {
uint32_t n_batch; // prompt processing maximum batch size uint32_t n_batch; // prompt processing maximum batch size
uint32_t n_threads; // number of threads to use for generation uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing uint32_t n_threads_batch; // number of threads to use for batch processing
int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
// ref: https://github.com/ggerganov/llama.cpp/pull/2054 // ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model float rope_freq_base; // RoPE base frequency, 0 = from model

View File

@ -156,8 +156,8 @@ int main(int argc, char** argv) {
t1 = std::chrono::high_resolution_clock::now(); t1 = std::chrono::high_resolution_clock::now();
float fs; float fs;
if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, x40.data(), y.data()); if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
else funcs.vec_dot(kVecSize * QK4_1, &fs, x41.data(), y.data()); else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
t2 = std::chrono::high_resolution_clock::now(); t2 = std::chrono::high_resolution_clock::now();
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count(); t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
if (iloop > 3) ggml.addResult(fs, t); if (iloop > 3) ggml.addResult(fs, t);

View File

@ -284,8 +284,8 @@ int main(int argc, char** argv) {
else { else {
auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type); auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
vdot.from_float(y1.data(), q8.data(), kVecSize); vdot.from_float(y1.data(), q8.data(), kVecSize);
if (useQ4_1) funcs.vec_dot(kVecSize, &result, q41.data(), q8.data()); if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
else funcs.vec_dot(kVecSize, &result, q40.data(), q8.data()); else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
} }
sumq += result; sumq += result;
t2 = std::chrono::high_resolution_clock::now(); t2 = std::chrono::high_resolution_clock::now();

View File

@ -14,7 +14,7 @@
# - Might be unstable! # - Might be unstable!
# #
# Usage: # Usage:
# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] # ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive]
# #
# --port: port number, default is 8888 # --port: port number, default is 8888
# --repo: path to a repo containing GGUF model files # --repo: path to a repo containing GGUF model files
@ -24,6 +24,7 @@
# --n-parallel: number of parallel requests, default is 8 # --n-parallel: number of parallel requests, default is 8
# --n-kv: KV cache size, default is 4096 # --n-kv: KV cache size, default is 4096
# --verbose: verbose output # --verbose: verbose output
# --non-interactive: run without asking a permission to run
# #
# Example: # Example:
# #
@ -47,6 +48,7 @@ if ! command -v make &> /dev/null; then
fi fi
# parse arguments # parse arguments
is_interactive=1
port=8888 port=8888
repo="" repo=""
wtype="" wtype=""
@ -66,7 +68,7 @@ verbose=0
function print_usage { function print_usage {
printf "Usage:\n" printf "Usage:\n"
printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n" printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive]\n\n"
printf " --port: port number, default is 8888\n" printf " --port: port number, default is 8888\n"
printf " --repo: path to a repo containing GGUF model files\n" printf " --repo: path to a repo containing GGUF model files\n"
printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n" printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
@ -75,6 +77,7 @@ function print_usage {
printf " --n-parallel: number of parallel requests, default is 8\n" printf " --n-parallel: number of parallel requests, default is 8\n"
printf " --n-kv: KV cache size, default is 4096\n" printf " --n-kv: KV cache size, default is 4096\n"
printf " --verbose: verbose output\n\n" printf " --verbose: verbose output\n\n"
printf " --non-interactive: run without asking a permission to run\n"
printf "Example:\n\n" printf "Example:\n\n"
printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n' printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
} }
@ -82,6 +85,10 @@ function print_usage {
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
key="$1" key="$1"
case $key in case $key in
--non-interactive)
is_interactive=0
shift
;;
--port) --port)
port="$2" port="$2"
shift shift
@ -141,6 +148,28 @@ for wt in "${wtypes[@]}"; do
wfiles+=("") wfiles+=("")
done done
# map wtype input to index
if [[ ! -z "$wtype" ]]; then
iw=-1
is=0
for wt in "${wtypes[@]}"; do
# uppercase
uwt=$(echo "$wt" | tr '[:lower:]' '[:upper:]')
if [[ "$uwt" == "$wtype" ]]; then
iw=$is
break
fi
is=$((is+1))
done
if [[ $iw -eq -1 ]]; then
printf "[-] Invalid weight type: %s\n" "$wtype"
exit 1
fi
wtype="$iw"
fi
# sample repos # sample repos
repos=( repos=(
"https://huggingface.co/TheBloke/Llama-2-7B-GGUF" "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
@ -154,31 +183,32 @@ repos=(
"https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF" "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
"https://huggingface.co/TheBloke/CausalLM-7B-GGUF" "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
) )
if [ $is_interactive -eq 1 ]; then
printf "\n"
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
printf " Based on the options that follow, the script might download a model file\n"
printf " from the internet, which can be a few GBs in size. The script will also\n"
printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n"
printf "\n"
printf " Upon success, an HTTP server will be started and it will serve the selected\n"
printf " model using llama.cpp for demonstration purposes.\n"
printf "\n"
printf " Please note:\n"
printf "\n"
printf " - All new data will be stored in the current folder\n"
printf " - The server will be listening on all network interfaces\n"
printf " - The server will run with default settings which are not always optimal\n"
printf " - Do not judge the quality of a model based on the results from this script\n"
printf " - Do not use this script to benchmark llama.cpp\n"
printf " - Do not use this script in production\n"
printf " - This script is only for demonstration purposes\n"
printf "\n"
printf " If you don't know what you are doing, please press Ctrl-C to abort now\n"
printf "\n"
printf " Press Enter to continue ...\n\n"
printf "\n" read
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n" fi
printf " Based on the options that follow, the script might download a model file\n"
printf " from the internet, which can be a few GBs in size. The script will also\n"
printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n"
printf "\n"
printf " Upon success, an HTTP server will be started and it will serve the selected\n"
printf " model using llama.cpp for demonstration purposes.\n"
printf "\n"
printf " Please note:\n"
printf "\n"
printf " - All new data will be stored in the current folder\n"
printf " - The server will be listening on all network interfaces\n"
printf " - The server will run with default settings which are not always optimal\n"
printf " - Do not judge the quality of a model based on the results from this script\n"
printf " - Do not use this script to benchmark llama.cpp\n"
printf " - Do not use this script in production\n"
printf " - This script is only for demonstration purposes\n"
printf "\n"
printf " If you don't know what you are doing, please press Ctrl-C to abort now\n"
printf "\n"
printf " Press Enter to continue ...\n\n"
read
if [[ -z "$repo" ]]; then if [[ -z "$repo" ]]; then
printf "[+] No repo provided from the command line\n" printf "[+] No repo provided from the command line\n"
@ -252,8 +282,10 @@ for file in $model_files; do
printf " %2d) %s %s\n" $iw "$have" "$file" printf " %2d) %s %s\n" $iw "$have" "$file"
done done
wfile="${wfiles[$wtype]}"
# ask for weights type until provided and available # ask for weights type until provided and available
while [[ -z "$wtype" ]]; do while [[ -z "$wfile" ]]; do
printf "\n" printf "\n"
read -p "[+] Select weight type: " wtype read -p "[+] Select weight type: " wtype
wfile="${wfiles[$wtype]}" wfile="${wfiles[$wtype]}"

View File

@ -97,6 +97,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# src/ggml-cuda.cu -> ggml-cuda.cu # src/ggml-cuda.cu -> ggml-cuda.cu
# src/ggml-cuda.h -> ggml-cuda.h # src/ggml-cuda.h -> ggml-cuda.h
# src/ggml-impl.h -> ggml-impl.h # src/ggml-impl.h -> ggml-impl.h
# src/ggml-kompute.cpp -> ggml-kompute.cpp
# src/ggml-kompute.h -> ggml-kompute.h
# src/ggml-metal.h -> ggml-metal.h # src/ggml-metal.h -> ggml-metal.h
# src/ggml-metal.m -> ggml-metal.m # src/ggml-metal.m -> ggml-metal.m
# src/ggml-mpi.h -> ggml-mpi.h # src/ggml-mpi.h -> ggml-mpi.h
@ -105,6 +107,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# src/ggml-opencl.h -> ggml-opencl.h # src/ggml-opencl.h -> ggml-opencl.h
# src/ggml-quants.c -> ggml-quants.c # src/ggml-quants.c -> ggml-quants.c
# src/ggml-quants.h -> ggml-quants.h # src/ggml-quants.h -> ggml-quants.h
# src/ggml-sycl.cpp -> ggml-sycl.cpp
# src/ggml-sycl.h -> ggml-sycl.h
# src/ggml-vulkan.cpp -> ggml-vulkan.cpp
# src/ggml-vulkan.h -> ggml-vulkan.h
# include/ggml/ggml.h -> ggml.h # include/ggml/ggml.h -> ggml.h
# include/ggml/ggml-alloc.h -> ggml-alloc.h # include/ggml/ggml-alloc.h -> ggml-alloc.h
# include/ggml/ggml-backend.h -> ggml-backend.h # include/ggml/ggml-backend.h -> ggml-backend.h
@ -123,6 +129,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
-e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \ -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
-e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \ -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
-e 's/src\/ggml-impl\.h/ggml-impl.h/g' \ -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
-e 's/src\/ggml-kompute\.cpp/ggml-kompute.cpp/g' \
-e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
-e 's/src\/ggml-metal\.h/ggml-metal.h/g' \ -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
-e 's/src\/ggml-metal\.m/ggml-metal.m/g' \ -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
-e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \ -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
@ -131,6 +139,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
-e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \ -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
-e 's/src\/ggml-quants\.c/ggml-quants.c/g' \ -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
-e 's/src\/ggml-quants\.h/ggml-quants.h/g' \ -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
-e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
-e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
-e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \
-e 's/src\/ggml-vulkan\.h/ggml-vulkan.h/g' \
-e 's/include\/ggml\/ggml\.h/ggml.h/g' \ -e 's/include\/ggml\/ggml\.h/ggml.h/g' \
-e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \ -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
-e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \ -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \

View File

@ -1 +1 @@
475cbad5c1c834e31e26a2283bc1413181644360 5070f078a67c18c11736e78316ab715ca9afde16

View File

@ -7,6 +7,8 @@ cp -rpv ../ggml/src/ggml-backend.c ./ggml-backend.c
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
cp -rpv ../ggml/src/ggml-impl.h ./ggml-impl.h cp -rpv ../ggml/src/ggml-impl.h ./ggml-impl.h
cp -rpv ../ggml/src/ggml-kompute.cpp ./ggml-kompute.cpp
cp -rpv ../ggml/src/ggml-kompute.h ./ggml-kompute.h
cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h
cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
@ -16,6 +18,10 @@ cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c
cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h
cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml-sycl.cpp
cp -rpv ../ggml/src/ggml-sycl.h ./ggml-sycl.h
cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml-vulkan.cpp
cp -rpv ../ggml/src/ggml-vulkan.h ./ggml-vulkan.h
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h
cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h

1
spm-headers/ggml-alloc.h Symbolic link
View File

@ -0,0 +1 @@
../ggml-alloc.h

1
spm-headers/ggml-backend.h Symbolic link
View File

@ -0,0 +1 @@
../ggml-backend.h

1
spm-headers/ggml.h Symbolic link
View File

@ -0,0 +1 @@
../ggml.h

2
tests/.gitignore vendored
View File

@ -1,3 +1,3 @@
* *
!*.* !*.*
test-c.o *.o

View File

@ -105,7 +105,7 @@ int main()
for (auto rule : expected_rules) for (auto rule : expected_rules)
{ {
parsed_grammar.rules.push_back({}); parsed_grammar.rules.emplace_back();
for (auto element : rule) for (auto element : rule)
{ {
parsed_grammar.rules.back().push_back(element); parsed_grammar.rules.back().push_back(element);

View File

@ -87,7 +87,7 @@ static float dot_product_error(
vdot.from_float(test_data2, tmp_q2.data(), test_size); vdot.from_float(test_data2, tmp_q2.data(), test_size);
float result = INFINITY; float result = INFINITY;
qfns.vec_dot(test_size, &result, tmp_q1.data(), tmp_q2.data()); qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
const float dot_ref = dot_product(test_data1, test_data2, test_size); const float dot_ref = dot_product(test_data1, test_data2, test_size);

View File

@ -346,7 +346,7 @@ int main(int argc, char * argv[]) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float { auto quantize_fn = [&](void) -> float {
float result; float result;
qfns.vec_dot(size, &result, test_q1, test_q2); qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
return result; return result;
}; };
size_t quantized_size = ggml_row_size(type, size); size_t quantized_size = ggml_row_size(type, size);

View File

@ -235,6 +235,8 @@ int main(void) {
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1); test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1);
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3); test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3);
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4);
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f);