diff --git a/.devops/main-intel.Dockerfile b/.devops/main-intel.Dockerfile new file mode 100644 index 000000000..e1e6acc24 --- /dev/null +++ b/.devops/main-intel.Dockerfile @@ -0,0 +1,26 @@ +ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04 +ARG UBUNTU_VERSION=22.04 + +FROM intel/hpckit:$ONEAPI_VERSION as build + +RUN apt-get update && \ + apt-get install -y git + +WORKDIR /app + +COPY . . + +# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance +RUN mkdir build && \ + cd build && \ + cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \ + cmake --build . --config Release --target main server + +FROM ubuntu:$UBUNTU_VERSION as runtime + +COPY --from=build /app/build/bin/main /main +COPY --from=build /app/build/bin/server /server + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/main" ] diff --git a/.devops/nix/nixpkgs-instances.nix b/.devops/nix/nixpkgs-instances.nix index 6e9872b28..4a2f81c4b 100644 --- a/.devops/nix/nixpkgs-instances.nix +++ b/.devops/nix/nixpkgs-instances.nix @@ -7,6 +7,18 @@ { system, ... }: { _module.args = { + # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs + # again, the below creates several nixpkgs instances which the + # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`. + # + # This is currently "slow" and "expensive", on a certain scale. + # This also isn't "right" in that this hinders dependency injection at + # the level of flake inputs. This might get removed in the foreseeable + # future. + # + # Note that you can use these expressions without Nix + # (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point). + pkgsCuda = import inputs.nixpkgs { inherit system; # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc, diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 43bdbd755..a868a9a61 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -73,6 +73,7 @@ let ps: [ ps.numpy ps.sentencepiece + ps.tiktoken ps.torchWithoutCuda ps.transformers ] @@ -114,14 +115,22 @@ effectiveStdenv.mkDerivation ( pname = "llama-cpp${pnameSuffix}"; version = llamaVersion; + # Note: none of the files discarded here are visible in the sandbox or + # affect the output hash. This also means they can be modified without + # triggering a rebuild. src = lib.cleanSourceWith { filter = name: type: - !(builtins.any (_: _) [ + let + noneOf = builtins.all (x: !x); + baseName = baseNameOf name; + in + noneOf [ (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths - (name == "README.md") # Ignore *.md changes whe computing outPaths - (lib.hasPrefix "." name) # Skip hidden files and directories - ]); + (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths + (lib.hasPrefix "." 
baseName) # Skip hidden files and directories + (baseName == "flake.lock") + ]; src = lib.cleanSource ../../.; }; @@ -159,7 +168,7 @@ effectiveStdenv.mkDerivation ( cmakeFlags = [ - (cmakeBool "LLAMA_NATIVE" true) + (cmakeBool "LLAMA_NATIVE" false) (cmakeBool "LLAMA_BUILD_SERVER" true) (cmakeBool "BUILD_SHARED_LIBS" true) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) @@ -216,6 +225,9 @@ effectiveStdenv.mkDerivation ( description = "contains numpy and sentencepiece"; buildInputs = [ llama-python ]; inputsFrom = [ finalAttrs.finalPackage ]; + shellHook = '' + addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib" + ''; }; shell-extra = mkShell { diff --git a/.devops/nix/scope.nix b/.devops/nix/scope.nix index 7932ac1e8..d295995a4 100644 --- a/.devops/nix/scope.nix +++ b/.devops/nix/scope.nix @@ -4,6 +4,10 @@ llamaVersion ? "0.0.0", }: +# We're using `makeScope` instead of just writing out an attrset +# because it allows users to apply overlays later using `overrideScope'`. +# Cf. https://noogle.dev/f/lib/makeScope + lib.makeScope newScope ( self: { inherit llamaVersion; diff --git a/.devops/server-cuda.Dockerfile b/.devops/server-cuda.Dockerfile new file mode 100644 index 000000000..4f83904bc --- /dev/null +++ b/.devops/server-cuda.Dockerfile @@ -0,0 +1,32 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=11.7.1 +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +# Target the CUDA runtime image +ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} as build + +# Unless otherwise specified, we make a fat build. +ARG CUDA_DOCKER_ARCH=all + +RUN apt-get update && \ + apt-get install -y build-essential git + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} +# Enable cuBLAS +ENV LLAMA_CUBLAS=1 + +RUN make + +FROM ${BASE_CUDA_RUN_CONTAINER} as runtime + +COPY --from=build /app/server /server + +ENTRYPOINT [ "/server" ] diff --git a/.devops/server-intel.Dockerfile b/.devops/server-intel.Dockerfile new file mode 100644 index 000000000..e343d278c --- /dev/null +++ b/.devops/server-intel.Dockerfile @@ -0,0 +1,25 @@ +ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04 +ARG UBUNTU_VERSION=22.04 + +FROM intel/hpckit:$ONEAPI_VERSION as build + +RUN apt-get update && \ + apt-get install -y git + +WORKDIR /app + +COPY . . + +# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance +RUN mkdir build && \ + cd build && \ + cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \ + cmake --build . --config Release --target main server + +FROM ubuntu:$UBUNTU_VERSION as runtime + +COPY --from=build /app/build/bin/server /server + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/server" ] diff --git a/.devops/server-rocm.Dockerfile b/.devops/server-rocm.Dockerfile new file mode 100644 index 000000000..e9a31647c --- /dev/null +++ b/.devops/server-rocm.Dockerfile @@ -0,0 +1,45 @@ +ARG UBUNTU_VERSION=22.04 + +# This needs to generally match the container host's environment. +ARG ROCM_VERSION=5.6 + +# Target the CUDA build image +ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete + +FROM ${BASE_ROCM_DEV_CONTAINER} as build + +# Unless otherwise specified, we make a fat build. 
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# This is mostly tied to rocBLAS supported archs. +ARG ROCM_DOCKER_ARCH=\ + gfx803 \ + gfx900 \ + gfx906 \ + gfx908 \ + gfx90a \ + gfx1010 \ + gfx1030 \ + gfx1100 \ + gfx1101 \ + gfx1102 + +COPY requirements.txt requirements.txt +COPY requirements requirements + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} +# Enable ROCm +ENV LLAMA_HIPBLAS=1 +ENV CC=/opt/rocm/llvm/bin/clang +ENV CXX=/opt/rocm/llvm/bin/clang++ + +RUN make + +ENTRYPOINT [ "/app/server" ] diff --git a/.devops/server.Dockerfile b/.devops/server.Dockerfile new file mode 100644 index 000000000..134588fe2 --- /dev/null +++ b/.devops/server.Dockerfile @@ -0,0 +1,20 @@ +ARG UBUNTU_VERSION=22.04 + +FROM ubuntu:$UBUNTU_VERSION as build + +RUN apt-get update && \ + apt-get install -y build-essential git + +WORKDIR /app + +COPY . . + +RUN make + +FROM ubuntu:$UBUNTU_VERSION as runtime + +COPY --from=build /app/server /server + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/server" ] diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 367df07a7..d22a041a6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -72,7 +72,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose --timeout 900 + ctest -L main --verbose --timeout 900 ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest @@ -107,7 +107,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose --timeout 900 + ctest -L main --verbose --timeout 900 ubuntu-latest-cmake-mpi: runs-on: ubuntu-latest @@ -141,7 +141,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest -L main --verbose # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. 
@@ -202,7 +202,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose --timeout 900 + ctest -L main --verbose --timeout 900 macOS-latest-cmake-ios: runs-on: macos-latest @@ -295,7 +295,7 @@ jobs: OPENBLAS_VERSION: 0.3.23 OPENCL_VERSION: 2023.04.17 CLBLAST_VERSION: 1.6.0 - SDE_VERSION: 9.21.1-2023-04-24 + SDE_VERSION: 9.33.0-2024-01-07 strategy: matrix: @@ -394,19 +394,19 @@ jobs: if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512 run: | cd build - ctest -C Release --verbose --timeout 900 + ctest -L main -C Release --verbose --timeout 900 - name: Test (Intel SDE) id: cmake_test_sde if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation run: | - curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz" + curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz" # for some weird reason windows tar doesn't like sde tar.xz 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) cd build - & $sde -future -- ctest -C Release --verbose --timeout 900 + & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - name: Determine tag name id: tag diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 87904b75e..94f9161fc 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -28,13 +28,18 @@ jobs: config: - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" } # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I # have disabled them for now until the reason why # is understood. - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" } steps: - name: Check out the repo uses: actions/checkout@v3 diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml index be7c26d40..0c6cf5f09 100644 --- a/.github/workflows/nix-ci-aarch64.yml +++ b/.github/workflows/nix-ci-aarch64.yml @@ -2,13 +2,20 @@ name: Nix aarch64 builds on: workflow_dispatch: # allows manual triggering + schedule: + # Rebuild daily rather than on every push because QEMU is expensive (e.g. + # 1.5h instead of minutes with the cold cache). 
+ # + # randint(0, 59), randint(0, 23) + - cron: '26 12 * * *' + # But also rebuild if we touched any of the Nix expressions: push: branches: - master - paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix'] + paths: ['**/*.nix', 'flake.lock'] pull_request: types: [opened, synchronize, reopened] - paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix'] + paths: ['**/*.nix', 'flake.lock'] jobs: nix-build-aarch64: diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml index 845b93bfb..d19c7a576 100644 --- a/.github/workflows/nix-ci.yml +++ b/.github/workflows/nix-ci.yml @@ -5,10 +5,8 @@ on: push: branches: - master - paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix'] pull_request: types: [opened, synchronize, reopened] - paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix'] jobs: nix-eval: diff --git a/.gitignore b/.gitignore index 5ab81445d..cb0069bfb 100644 --- a/.gitignore +++ b/.gitignore @@ -27,7 +27,7 @@ lcov-report/ gcovr-report/ -build*/ +build* out/ tmp/ @@ -89,20 +89,3 @@ examples/jeopardy/results.txt poetry.lock poetry.toml - -# Test binaries -/tests/test-grammar-parser -/tests/test-llama-grammar -/tests/test-double-float -/tests/test-grad0 -/tests/test-opt -/tests/test-quantize-fns -/tests/test-quantize-perf -/tests/test-sampling -/tests/test-tokenizer-0-llama -/tests/test-tokenizer-0-falcon -/tests/test-tokenizer-1-llama -/tests/test-tokenizer-1-bpe -/tests/test-rope -/tests/test-backend-ops -/tests/test-autorelease diff --git a/CMakeLists.txt b/CMakeLists.txt index 3fc65eaf2..2b2ae532e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,6 +47,7 @@ option(BUILD_SHARED_LIBS "build shared libraries" option(LLAMA_STATIC "llama: static link libraries" OFF) option(LLAMA_NATIVE "llama: enable -march=native flag" ON) option(LLAMA_LTO "llama: enable link time optimization" OFF) +option(LLAMA_CCACHE "llama: use ccache if available" ON) # debug option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) @@ -107,6 +108,13 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STA option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ON) + +# add perf arguments +option(LLAMA_PERF "llama: enable perf" OFF) +if (LLAMA_PERF) + add_definitions(-DGGML_PERF) +endif() + # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) @@ -458,18 +466,23 @@ function(get_flags CCID CCVER) (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) ) - set(C_FLAGS ${C_FLAGS} -Wdouble-promotion) + list(APPEND C_FLAGS -Wdouble-promotion) endif() elseif (CCID STREQUAL "GNU") set(C_FLAGS -Wdouble-promotion) set(CXX_FLAGS -Wno-array-bounds) if (CCVER VERSION_GREATER_EQUAL 7.1.0) - set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation) + list(APPEND CXX_FLAGS -Wno-format-truncation) endif() if (CCVER VERSION_GREATER_EQUAL 8.1.0) - set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi) + list(APPEND CXX_FLAGS -Wextra-semi) endif() + elseif (CCID MATCHES "Intel") 
+ # enable max optimization level when using Intel compiler + set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector) + set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector) + add_link_options(-fuse-ld=lld -static-intel) endif() set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE) @@ -497,16 +510,18 @@ if (LLAMA_ALL_WARNINGS) endif() endif() +set(CUDA_CXX_FLAGS "") + if (LLAMA_CUBLAS) set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math) if (NOT MSVC) - set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic) + list(APPEND CUDA_FLAGS -Wno-pedantic) endif() if (LLAMA_ALL_WARNINGS AND NOT MSVC) set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c) if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "") - set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER}) + list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER}) endif() execute_process( @@ -534,13 +549,8 @@ if (LLAMA_CUBLAS) message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") get_flags(${CUDA_CCID} ${CUDA_CCVER}) - list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS) # pass host compiler flags as a single argument - if (NOT CUDA_CXX_FLAGS STREQUAL "") - set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS}) - endif() + list(APPEND CUDA_CXX_FLAGS ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later endif() - - add_compile_options("$<$:${CUDA_FLAGS}>") endif() if (WIN32) @@ -561,6 +571,17 @@ if (LLAMA_LTO) endif() endif() +if (LLAMA_CCACHE) + find_program(LLAMA_CCACHE_FOUND ccache) + if (LLAMA_CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set(ENV{CCACHE_SLOPPINESS} time_macros) + message(STATUS "Using ccache") + else() + message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF") + endif () +endif() + # this version of Apple ld64 is buggy execute_process( COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v @@ -594,12 +615,7 @@ if (NOT MSVC) endif() endif() -function(add_compile_option_cpp ARG) - # Adds a compile option to C/C++ only, but not for Cuda. - # Use, e.g., for CPU-architecture flags. 
- add_compile_options($<$:${ARG}>) - add_compile_options($<$:${ARG}>) -endfunction() +set(ARCH_FLAGS "") if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64")) message(STATUS "ARM detected") @@ -612,19 +628,19 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC else() check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") - add_compile_options(-mfp16-format=ieee) + list(APPEND ARCH_FLAGS -mfp16-format=ieee) endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") # Raspberry Pi 1, Zero - add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access) + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") # Raspberry Pi 2 - add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") # Raspberry Pi 3, 4, Zero 2 (32-bit) - add_compile_options(-mno-unaligned-access) + list(APPEND ARCH_FLAGS -mno-unaligned-access) endif() endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" ) @@ -635,7 +651,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE include(cmake/FindSIMD.cmake) endif () if (LLAMA_AVX512) - add_compile_option_cpp(/arch:AVX512) + list(APPEND ARCH_FLAGS /arch:AVX512) # MSVC has no compile-time flags enabling specific # AVX512 extensions, neither it defines the # macros corresponding to the extensions. 
@@ -649,49 +665,61 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE add_compile_definitions($<$:__AVX512VNNI__>) endif() elseif (LLAMA_AVX2) - add_compile_option_cpp(/arch:AVX2) + list(APPEND ARCH_FLAGS /arch:AVX2) elseif (LLAMA_AVX) - add_compile_option_cpp(/arch:AVX) + list(APPEND ARCH_FLAGS /arch:AVX) endif() else() if (LLAMA_NATIVE) - add_compile_option_cpp(-march=native) + list(APPEND ARCH_FLAGS -march=native) endif() if (LLAMA_F16C) - add_compile_option_cpp(-mf16c) + list(APPEND ARCH_FLAGS -mf16c) endif() if (LLAMA_FMA) - add_compile_option_cpp(-mfma) + list(APPEND ARCH_FLAGS -mfma) endif() if (LLAMA_AVX) - add_compile_option_cpp(-mavx) + list(APPEND ARCH_FLAGS -mavx) endif() if (LLAMA_AVX2) - add_compile_option_cpp(-mavx2) + list(APPEND ARCH_FLAGS -mavx2) endif() if (LLAMA_AVX512) - add_compile_option_cpp(-mavx512f) - add_compile_option_cpp(-mavx512bw) + list(APPEND ARCH_FLAGS -mavx512f) + list(APPEND ARCH_FLAGS -mavx512bw) endif() if (LLAMA_AVX512_VBMI) - add_compile_option_cpp(-mavx512vbmi) + list(APPEND ARCH_FLAGS -mavx512vbmi) endif() if (LLAMA_AVX512_VNNI) - add_compile_option_cpp(-mavx512vnni) + list(APPEND ARCH_FLAGS -mavx512vnni) endif() endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") message(STATUS "PowerPC detected") if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") - add_compile_options(-mcpu=powerpc64le) + list(APPEND ARCH_FLAGS -mcpu=powerpc64le) else() - add_compile_options(-mcpu=native -mtune=native) + list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) endif() else() message(STATUS "Unknown architecture") endif() +add_compile_options("$<$:${ARCH_FLAGS}>") +add_compile_options("$<$:${ARCH_FLAGS}>") + +if (LLAMA_CUBLAS) + list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS}) + list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument + if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "") + list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED}) + endif() + add_compile_options("$<$:${CUDA_FLAGS}>") +endif() + if (MINGW) # Target Windows 8 for PrefetchVirtualMemory add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER}) diff --git a/Makefile b/Makefile index a8658a596..b8858b412 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ TEST_TARGETS = \ tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ - tests/test-backend-ops tests/test-autorelease + tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report @@ -619,7 +619,7 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +server: examples/server/server.cpp examples/server/oai.hpp 
examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) @@ -748,5 +748,8 @@ tests/test-c.o: tests/test-c.c llama.h tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + +tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/README.md b/README.md index 866aa87b4..44898d2f2 100644 --- a/README.md +++ b/README.md @@ -10,11 +10,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ### Hot topics +- ⚠️ Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138 - New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow - Collecting Apple Silicon performance stats: - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167 - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508 -- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216 ---- @@ -112,6 +112,7 @@ as the main playground for developing new features for the [ggml](https://github - [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava) - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5) - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V) +- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM) **Bindings:** @@ -121,13 +122,15 @@ as the main playground for developing new features for the [ggml](https://github - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) -- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) +- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) +- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) +- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) **UI:** @@ -929,17 +932,20 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th * Create a folder to 
store big models & intermediate files (ex. /llama/models) #### Images -We have two Docker images available for this project: +We have three Docker images available for this project: 1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`) 2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`) +3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executabhle file. (platforms: `linux/amd64`, `linux/arm64`) Additionally, there the following images, similar to the above: - `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now). @@ -965,6 +971,12 @@ or with a light image: docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 ``` +or with a server image: + +```bash +docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 +``` + ### Docker With CUDA Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container. @@ -974,6 +986,7 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia ```bash docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile . docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile . +docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile . ``` You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture. @@ -987,6 +1000,7 @@ The resulting images, are essentially the same as the non-CUDA images: 1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. 2. `local/llama.cpp:light-cuda`: This image only includes the main executable file. +3. `local/llama.cpp:server-cuda`: This image only includes the server executable file. 
#### Usage @@ -995,6 +1009,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne ```bash docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` ### Contributing diff --git a/ci/run.sh b/ci/run.sh index 791b17a19..2427e55a2 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -22,9 +22,9 @@ mkdir -p "$2" OUT=$(realpath "$1") MNT=$(realpath "$2") -rm -v $OUT/*.log -rm -v $OUT/*.exit -rm -v $OUT/*.md +rm -f "$OUT/*.log" +rm -f "$OUT/*.exit" +rm -f "$OUT/*.md" sd=`dirname $0` cd $sd/../ @@ -94,7 +94,7 @@ function gg_run_ctest_debug { (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log set +e } @@ -123,9 +123,9 @@ function gg_run_ctest_release { (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log if [ -z ${GG_BUILD_LOW_PERF} ]; then - (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log else - (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log fi set +e @@ -141,6 +141,61 @@ function gg_sum_ctest_release { gg_printf '```\n' } +function gg_get_model { + local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf" + local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf" + if [[ -s $gguf_3b ]]; then + echo -n "$gguf_3b" + elif [[ -s $gguf_7b ]]; then + echo -n "$gguf_7b" + else + echo >&2 "No model found. Can't run gg_run_ctest_with_model." + exit 1 + fi +} + +function gg_run_ctest_with_model_debug { + cd ${SRC} + + local model; model=$(gg_get_model) + cd build-ci-debug + set -e + (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log + set +e + cd .. +} + +function gg_run_ctest_with_model_release { + cd ${SRC} + + local model; model=$(gg_get_model) + cd build-ci-release + set -e + (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log + set +e + cd .. 
+} + +function gg_sum_ctest_with_model_debug { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs ctest with model files in debug mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)" + gg_printf '```\n' +} + +function gg_sum_ctest_with_model_release { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs ctest with model files in release mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)" + gg_printf '```\n' +} + # open_llama_3b_v2 function gg_run_open_llama_3b_v2 { @@ -183,8 +238,6 @@ function gg_run_open_llama_3b_v2 { wiki_test_60="${path_wiki}/wiki.test-60.raw" - ./bin/test-autorelease ${model_f16} - ./bin/quantize ${model_f16} ${model_q8_0} q8_0 ./bin/quantize ${model_f16} ${model_q4_0} q4_0 ./bin/quantize ${model_f16} ${model_q4_1} q4_1 @@ -507,14 +560,18 @@ function gg_sum_open_llama_7b_v2 { ## main if [ -z ${GG_BUILD_LOW_PERF} ]; then + # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt rm -rf ${SRC}/models-mnt - mnt_models=${MNT}/models mkdir -p ${mnt_models} ln -sfn ${mnt_models} ${SRC}/models-mnt - python3 -m pip install -r ${SRC}/requirements.txt - python3 -m pip install --editable gguf-py + # Create a fresh python3 venv and enter it + python3 -m venv "$MNT/venv" + source "$MNT/venv/bin/activate" + + pip install -r ${SRC}/requirements.txt --disable-pip-version-check + pip install --editable gguf-py --disable-pip-version-check fi ret=0 @@ -529,6 +586,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then else test $ret -eq 0 && gg_run open_llama_7b_v2 fi + test $ret -eq 0 && gg_run ctest_with_model_debug + test $ret -eq 0 && gg_run ctest_with_model_release fi fi diff --git a/common/common.cpp b/common/common.cpp index ce20360a4..6b07f1197 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -203,6 +203,23 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.prompt_cache_all = true; } else if (arg == "--prompt-cache-ro") { params.prompt_cache_ro = true; + } else if (arg == "-bf" || arg == "--binary-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i], std::ios::binary); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + break; + } + // store the external file name in params + params.prompt_file = argv[i]; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); } else if (arg == "-f" || arg == "--file") { if (++i >= argc) { invalid_param = true; @@ -653,6 +670,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.logdir.back() != DIRECTORY_SEPARATOR) { params.logdir += DIRECTORY_SEPARATOR; } + } else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.logits_file = argv[i]; } else if (arg == "--perplexity" || arg == "--all-logits") { params.logits_all = true; } else if (arg == "--ppl-stride") { @@ -689,6 +712,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.winogrande_tasks = std::stoi(argv[i]); + } else if (arg == "--multiple-choice") { + params.multiple_choice = true; + } else if (arg == "--multiple-choice-tasks") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.multiple_choice_tasks = std::stoi(argv[i]); + 
} else if (arg == "--kl-divergence") { + params.kl_divergence = true; } else if (arg == "--ignore-eos") { params.ignore_eos = true; } else if (arg == "--no-penalize-nl") { @@ -888,6 +921,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); printf(" -f FNAME, --file FNAME\n"); printf(" prompt file to start generation.\n"); + printf(" -bf FNAME, --binary-file FNAME\n"); + printf(" binary file containing multiple choice tasks.\n"); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); @@ -936,6 +971,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n"); printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks); + printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n"); + printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks); + printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base"); printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); diff --git a/common/common.h b/common/common.h index 0ae9c18b3..214a379b5 100644 --- a/common/common.h +++ b/common/common.h @@ -91,6 +91,7 @@ struct gpt_params { std::string input_suffix = ""; // string to suffix user inputs with std::vector antiprompt; // string upon seeing which more user input is prompted std::string logdir = ""; // directory in which to save YAML log files + std::string logits_file = ""; // file for saving *all* logits std::vector kv_overrides; @@ -108,6 +109,11 @@ struct gpt_params { bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed + bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt + size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. 
If 0, all tasks will be computed + + bool kl_divergence = false; // compute KL-divergence + bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs diff --git a/common/sampling.cpp b/common/sampling.cpp index dd1ffeb1b..e8675a8c0 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -13,6 +13,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_ // will be empty (default) if there are parse errors if (result->parsed_grammar.rules.empty()) { fprintf(stderr, "%s: failed to parse grammar\n", __func__); + delete result; return nullptr; } @@ -129,6 +130,8 @@ static void sampler_queue( const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); const float temp = params.temp; + const float dynatemp_range = params.dynatemp_range; + const float dynatemp_exponent = params.dynatemp_exponent; const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; const float top_p = params.top_p; const float min_p = params.min_p; @@ -143,7 +146,15 @@ static void sampler_queue( case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; - case 't': llama_sample_temp (ctx_main, &cur_p, temp); break; + case 't': + if (dynatemp_range > 0) { + float dynatemp_min = std::max(0.0f, temp - dynatemp_range); + float dynatemp_max = std::max(0.0f, temp + dynatemp_range); + llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent); + } else { + llama_sample_temp(ctx_main, &cur_p, temp); + } + break; default : break; } } diff --git a/common/sampling.h b/common/sampling.h index 2ee180376..88899c094 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -18,6 +18,8 @@ typedef struct llama_sampling_params { float tfs_z = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) float penalty_repeat = 1.10f; // 1.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5cb3e63fb..6ab7f486e 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -10,7 +10,7 @@ import re import sys from enum import IntEnum from pathlib import Path -from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional +from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast import numpy as np import torch @@ -201,6 +201,8 @@ class Model: return PlamoModel if model_architecture == "CodeShellForCausalLM": return CodeShellModel + if model_architecture == "OrionForCausalLM": + return OrionModel return Model def _is_model_safetensors(self) -> bool: @@ -250,6 +252,8 @@ class Model: return gguf.MODEL_ARCH.PLAMO if arch == "CodeShellForCausalLM": return gguf.MODEL_ARCH.CODESHELL + if arch == "OrionForCausalLM": + return gguf.MODEL_ARCH.ORION raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -289,6 +293,58 @@ class Model: special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) 
special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_qwen(self): + dir_model = self.dir_model + hparams = self.hparams + tokens: list[bytearray] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + vocab_size = hparams["vocab_size"] + assert max(tokenizer.get_vocab().values()) < vocab_size + + merges = [] + vocab = {} + mergeable_ranks = tokenizer.mergeable_ranks + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + assert len(merged) == 2 + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined + added_vocab = tokenizer.special_tokens + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()} + + for i in range(vocab_size): + if i not in reverse_vocab: + pad_token = f"[PAD{i}]".encode("utf-8") + tokens.append(bytearray(pad_token)) + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) + special_vocab.merges = merges + # only add special tokens when they were not already loaded from config.json + if len(special_vocab.special_token_ids) == 0: + special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) + # this one is usually not in config.json anyway + special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_sentencepiece(self): from sentencepiece import SentencePieceProcessor @@ -487,7 +543,8 @@ class MPTModel(Model): # map tensor names if "scales" in name: new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) - new_name = new_name.replace("scales", "act.scales") + if new_name is not None: + new_name = new_name.replace("scales", "act.scales") else: new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: @@ -519,6 +576,83 @@ class MPTModel(Model): self.gguf_writer.add_tensor("output.weight", data) +class OrionModel(Model): + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + print("gguf: can not find ctx length parameter.") + sys.exit() + + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) 
+ self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) + + def write_tensors(self): + # Collect tensors from generator object + model_kv = dict(self.get_tensors()) + block_count = self.hparams["num_hidden_layers"] + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + class BaichuanModel(Model): def set_vocab(self): self._set_vocab_sentencepiece() @@ -876,6 +1010,13 @@ class PersimmonModel(Model): class StableLMModel(Model): + def set_vocab(self): + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + else: + # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab + self._set_vocab_qwen() + def set_gguf_parameters(self): hparams = self.hparams block_count = hparams["num_hidden_layers"] @@ -904,7 +1045,7 @@ class QwenModel(Model): return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]: + def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: parts = [bytes([b]) for b in token] while True: min_idx = None @@ -921,52 +1062,7 @@ class QwenModel(Model): return parts def set_vocab(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[bytearray] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams["vocab_size"] - assert max(tokenizer.get_vocab().values()) < vocab_size - - merges = [] - vocab = {} - mergeable_ranks = tokenizer.mergeable_ranks - for token, rank in mergeable_ranks.items(): - vocab[self.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - assert len(merged) == 2 - 
merges.append(' '.join(map(self.token_bytes_to_string, merged))) - - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()} - added_vocab = tokenizer.special_tokens - - for i in range(vocab_size): - if i not in reverse_vocab: - pad_token = f"[PAD{i}]".encode("utf-8") - tokens.append(bytearray(pad_token)) - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.CONTROL) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) - special_vocab.merges = merges - special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab.add_to_gguf(self.gguf_writer) + self._set_vocab_qwen() def set_gguf_parameters(self): self.gguf_writer.add_name("Qwen") @@ -1285,7 +1381,7 @@ def main() -> None: if args.awq_path: sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) - from awq.apply_awq import add_scale_weights + from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] tmp_model_path = args.model / "weighted_model" dir_model = tmp_model_path if tmp_model_path.is_dir(): diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py index e359330af..b33108062 100755 --- a/convert-llama-ggml-to-gguf.py +++ b/convert-llama-ggml-to-gguf.py @@ -2,6 +2,7 @@ from __future__ import annotations import argparse +import os import struct import sys from enum import IntEnum @@ -9,7 +10,6 @@ from pathlib import Path import numpy as np -import os if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf @@ -371,15 +371,11 @@ def handle_metadata(cfg, hp): params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path) else: raise ValueError('Unable to load metadata') - vocab = convert.load_vocab( - cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, - cfg.vocabtype) - # FIXME: Respect cfg.vocab_dir? 
- svocab = gguf.SpecialVocab(cfg.model_metadata_dir, - load_merges = cfg.vocabtype == 'bpe', - n_vocab = vocab.vocab_size) + vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir) + vocab_factory = convert.VocabFactory(vocab_path) + vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir) convert.check_vocab_size(params, vocab) - return (params, vocab, svocab) + return params, vocab, special_vocab def handle_args(): diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index 35ce152f4..9a9936dec 100755 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -5,17 +5,16 @@ import json import os import struct import sys +from pathlib import Path from typing import Any, BinaryIO, Sequence import numpy as np import torch -from pathlib import Path if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) import gguf - NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} @@ -60,7 +59,14 @@ if __name__ == '__main__': input_model = os.path.join(sys.argv[1], "adapter_model.bin") output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") - model = torch.load(input_model, map_location="cpu") + if os.path.exists(input_model): + model = torch.load(input_model, map_location="cpu") + else: + input_model = os.path.join(sys.argv[1], "adapter_model.safetensors") + # lazy import load_file only if lora is in safetensors format. + from safetensors.torch import load_file + model = load_file(input_model, device="cpu") + arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" if arch_name not in gguf.MODEL_ARCH_NAMES.values(): diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py index 1ba5864dc..d2be805d1 100755 --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 -import torch -import os -from pprint import pprint -import sys import argparse +import os +import sys from pathlib import Path +from pprint import pprint + +import torch from sentencepiece import SentencePieceProcessor + if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf @@ -69,7 +71,7 @@ def main(): persimmon_model = torch.load(args.ckpt_path) hparams = persimmon_model['args'] pprint(hparams) - tensors = {} + tensors: dict[str, torch.Tensor] = {} _flatten_dict(persimmon_model['model'], tensors, None) arch = gguf.MODEL_ARCH.PERSIMMON diff --git a/convert.py b/convert.py index 980e6fc72..06768033d 100755 --- a/convert.py +++ b/convert.py @@ -17,58 +17,28 @@ import signal import struct import sys import time -import warnings import zipfile from abc import ABCMeta, abstractmethod -from argparse import ArgumentParser from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import ( - IO, - TYPE_CHECKING, - Any, - Callable, - Iterable, - Literal, - Optional, - Tuple, - TypeVar, -) +from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar import numpy as np from sentencepiece import SentencePieceProcessor -try: - from transformers import AutoTokenizer -except ModuleNotFoundError as e: - warnings.warn(f"Could not import AutoTokenizer from transformers: {e}") +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +import gguf -# If NO_LOCAL_GGUF is not set, try to import gguf from the local gguf-py directory -if 
"NO_LOCAL_GGUF" not in os.environ: - # Use absolute path to the gguf-py directory - gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py") - print(gguf_py_dir) # NOTE: Remove this once path is verified after changes are completed - if gguf_py_dir not in sys.path: - sys.path.insert(1, gguf_py_dir) +if TYPE_CHECKING: + from typing import TypeAlias -# Import gguf module -try: - import gguf -except ModuleNotFoundError as e: - print(f"Could not import gguf: {e}") - sys.exit(1) - -if TYPE_CHECKING: # NOTE: This isn't necessary. - from typing import TypeAlias # This can technically be omitted. - -if hasattr(faulthandler, "register") and hasattr(signal, "SIGUSR1"): +if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): faulthandler.register(signal.SIGUSR1) -# NOTE: n-dimensional arrays should be directly referenced -NDArray: TypeAlias = "np.ndarray[Any, Any]" +NDArray: TypeAlias = 'np.ndarray[Any, Any]' -# Why is this here? LLAMA and GPT are technically the only compatible ARCHs. ARCH = gguf.MODEL_ARCH.LLAMA DEFAULT_CONCURRENCY = 8 @@ -78,7 +48,6 @@ DEFAULT_CONCURRENCY = 8 # -# TODO: Clean up and refactor data types @dataclass(frozen=True) class DataType: name: str @@ -183,85 +152,65 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = { @dataclass class Params: - n_vocab: int - n_embd: int - n_layer: int - n_ctx: int - n_ff: int - n_head: int - n_head_kv: int - f_norm_eps: Optional[float] = None - n_experts: Optional[int] = None - n_experts_used: Optional[int] = None + n_vocab: int + n_embd: int + n_layer: int + n_ctx: int + n_ff: int + n_head: int + n_head_kv: int + n_experts: int | None = None + n_experts_used: int | None = None + f_norm_eps: float | None = None - rope_scaling_type: Optional[gguf.RopeScalingType] = None - f_rope_freq_base: Optional[float] = None - f_rope_scale: Optional[float] = None - n_orig_ctx: Optional[int] = None - rope_finetuned: Optional[bool] = None + rope_scaling_type: gguf.RopeScalingType | None = None + f_rope_freq_base: float | None = None + f_rope_scale: float | None = None + n_orig_ctx: int | None = None + rope_finetuned: bool | None = None - ftype: Optional[GGMLFileType] = None + ftype: GGMLFileType | None = None # path to the directory containing the model files - path_model: Optional[Path] = None + path_model: Path | None = None @staticmethod - def guessed(model: LazyModel) -> "Params": + def guessed(model: LazyModel) -> Params: # try transformer naming first - n_vocab, n_embd = ( - model["model.embed_tokens.weight"].shape - if "model.embed_tokens.weight" in model - else model["tok_embeddings.weight"].shape - ) + n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape # try transformer naming first if "model.layers.0.self_attn.q_proj.weight" in model: - n_layer = next( - i - for i in itertools.count() - if f"model.layers.{i}.self_attn.q_proj.weight" not in model - ) - elif ( - "model.layers.0.self_attn.W_pack.weight" in model - ): # next: try baichuan naming - n_layer = next( - i - for i in itertools.count() - if f"model.layers.{i}.self_attn.W_pack.weight" not in model - ) + n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) + elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming + n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) else: - n_layer = next( - i - for i in itertools.count() - if 
f"layers.{i}.attention.wq.weight" not in model - ) + n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) if n_layer < 1: - raise Exception( - "failed to guess 'n_layer'. This model is unknown or unsupported.\n" - "Suggestion: provide 'config.json' of the model in the same directory containing model files." - ) + raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n" + "Suggestion: provide 'config.json' of the model in the same directory containing model files.") - n_head = n_embd // 128 # guessed - n_mult = 256 # guessed + n_head = n_embd // 128 # guessed + n_mult = 256 # guessed # TODO: verify this n_ff = int(2 * (4 * n_embd) / 3) n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult) return Params( - n_vocab=n_vocab, - n_embd=n_embd, - n_layer=n_layer, - n_ctx=-1, - n_ff=n_ff, - n_head=n_head, - n_head_kv=n_head, - f_norm_eps=1e-5, + n_vocab = n_vocab, + n_embd = n_embd, + n_layer = n_layer, + n_ctx = -1, + n_ff = n_ff, + n_head = n_head, + n_head_kv = n_head, + f_norm_eps = 1e-5, ) @staticmethod - def load_transformers_config(model: LazyModel, config_path: Path) -> "Params": + def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None @@ -274,22 +223,20 @@ class Params: rope_scaling_type = gguf.RopeScalingType.LINEAR elif typ == "yarn": rope_scaling_type = gguf.RopeScalingType.YARN - n_orig_ctx = rope_scaling["original_max_position_embeddings"] - rope_finetuned = rope_scaling["finetuned"] + n_orig_ctx = rope_scaling['original_max_position_embeddings'] + rope_finetuned = rope_scaling['finetuned'] else: - raise NotImplementedError(f"Unknown rope scaling type: {typ}") + raise NotImplementedError(f'Unknown rope scaling type: {typ}') if "max_sequence_length" in config: n_ctx = config["max_sequence_length"] elif "max_position_embeddings" in config: n_ctx = config["max_position_embeddings"] else: - raise Exception( - "failed to guess 'n_ctx'. This model is unknown or unsupported.\n" - "Suggestion: provide 'config.json' of the model in the same directory containing model files." - ) + raise Exception("failed to guess 'n_ctx'. 
This model is unknown or unsupported.\n" + "Suggestion: provide 'config.json' of the model in the same directory containing model files.") - n_experts = None + n_experts = None n_experts_used = None if "num_local_experts" in config: @@ -297,30 +244,30 @@ class Params: n_experts_used = config["num_experts_per_tok"] return Params( - n_vocab=config["vocab_size"], - n_embd=config["hidden_size"], - n_layer=config["num_hidden_layers"], - n_ctx=n_ctx, - n_ff=config["intermediate_size"], - n_head=(n_head := config["num_attention_heads"]), - n_head_kv=config.get("num_key_value_heads", n_head), - n_experts=n_experts, - n_experts_used=n_experts_used, - f_norm_eps=config["rms_norm_eps"], - f_rope_freq_base=config.get("rope_theta"), - rope_scaling_type=rope_scaling_type, - f_rope_scale=f_rope_scale, - n_orig_ctx=n_orig_ctx, - rope_finetuned=rope_finetuned, + n_vocab = config["vocab_size"], + n_embd = config["hidden_size"], + n_layer = config["num_hidden_layers"], + n_ctx = n_ctx, + n_ff = config["intermediate_size"], + n_head = (n_head := config["num_attention_heads"]), + n_head_kv = config.get("num_key_value_heads", n_head), + n_experts = n_experts, + n_experts_used = n_experts_used, + f_norm_eps = config["rms_norm_eps"], + f_rope_freq_base = config.get("rope_theta"), + rope_scaling_type = rope_scaling_type, + f_rope_scale = f_rope_scale, + n_orig_ctx = n_orig_ctx, + rope_finetuned = rope_finetuned, ) # LLaMA v2 70B params.json # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} @staticmethod - def load_torch_params(model: LazyModel, config_path: Path) -> "Params": + def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) - n_experts = None + n_experts = None n_experts_used = None f_rope_freq_base = None @@ -343,50 +290,50 @@ class Params: if config.get("moe"): n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0] - n_experts = config["moe"]["num_experts"] + n_experts = config["moe"]["num_experts"] n_experts_used = config["moe"]["num_experts_per_tok"] f_rope_freq_base = 1e6 return Params( - n_vocab=model["tok_embeddings.weight"].shape[0], - n_embd=config["dim"], - n_layer=config["n_layers"], - n_ctx=n_ctx, - n_ff=n_ff, - n_head=(n_head := config["n_heads"]), - n_head_kv=config.get("n_kv_heads", n_head), - n_experts=n_experts, - n_experts_used=n_experts_used, - f_norm_eps=config["norm_eps"], - f_rope_freq_base=config.get("rope_theta", f_rope_freq_base), + n_vocab = model["tok_embeddings.weight"].shape[0], + n_embd = config["dim"], + n_layer = config["n_layers"], + n_ctx = n_ctx, + n_ff = n_ff, + n_head = (n_head := config["n_heads"]), + n_head_kv = config.get("n_kv_heads", n_head), + n_experts = n_experts, + n_experts_used = n_experts_used, + f_norm_eps = config["norm_eps"], + f_rope_freq_base = config.get("rope_theta", f_rope_freq_base), ) @staticmethod - def load(model_plus: ModelPlus) -> "Params": - hf_config_path = model_plus.paths[0].parent / "config.json" + def load(model_plus: ModelPlus) -> Params: + hf_config_path = model_plus.paths[0].parent / "config.json" orig_config_path = model_plus.paths[0].parent / "params.json" if hf_config_path.exists(): - params = Params.load_transformers_config(model_plus.model, hf_config_path) + params = Params.loadHFTransformerJson(model_plus.model, hf_config_path) elif orig_config_path.exists(): - params = Params.load_torch_params(model_plus.model, orig_config_path) - elif model_plus.format != 
"none": + params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path) + elif model_plus.format != 'none': params = Params.guessed(model_plus.model) else: - raise ValueError("Cannot guess params when model format is none") + raise ValueError('Cannot guess params when model format is none') params.path_model = model_plus.paths[0].parent return params -class BpeVocab: # GPT - def __init__( - self, fname_tokenizer: Path, fname_added_tokens: Optional[Path] - ) -> None: - self.bpe_tokenizer = json.loads( - open(str(fname_tokenizer), encoding="utf-8").read() - ) +# +# vocab +# + +class BpeVocab: + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: + self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) self.vocab = self.bpe_tokenizer["model"]["vocab"] added_tokens: dict[str, int] if fname_added_tokens is not None: @@ -394,34 +341,31 @@ class BpeVocab: # GPT added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) else: # Fall back to trying to find the added tokens in tokenizer.json - tokenizer_json_file = fname_tokenizer.parent / "tokenizer.json" + tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json' if not tokenizer_json_file.is_file(): added_tokens = {} else: tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) added_tokens = dict( - (item["content"], item["id"]) - for item in tokenizer_json.get("added_tokens", []) + (item['content'], item['id']) + for item in tokenizer_json.get('added_tokens', []) # Added tokens here can be duplicates of the main vocabulary. - if item["content"] not in self.bpe_tokenizer - ) + if item['content'] not in self.bpe_tokenizer) vocab_size: int = len(self.vocab) - expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) + expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) + actual_ids = sorted(added_tokens.values()) if expected_ids != actual_ids: expected_end_id = vocab_size + len(actual_ids) - 1 - raise Exception( - f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}" - ) + raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}") items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_dict = added_tokens - self.added_tokens_list = [text for (text, idx) in items] + self.added_tokens_dict = added_tokens + self.added_tokens_list = [text for (text, idx) in items] self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens + self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer + self.fname_added_tokens = fname_added_tokens def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()} @@ -442,10 +386,8 @@ class BpeVocab: # GPT return f"" -class SentencePieceVocab: # LlaMa - def __init__( - self, fname_tokenizer: Path, fname_added_tokens: Optional[Path] - ) -> None: +class SentencePieceVocab: + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) added_tokens: dict[str, int] if 
fname_added_tokens is not None: @@ -455,23 +397,19 @@ class SentencePieceVocab: # LlaMa vocab_size: int = self.sentencepiece_tokenizer.vocab_size() - new_tokens = { - id: piece for piece, id in added_tokens.items() if id >= vocab_size - } + new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) - actual_new_ids = sorted(new_tokens.keys()) + actual_new_ids = sorted(new_tokens.keys()) if expected_new_ids != actual_new_ids: - raise ValueError( - f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}" - ) + raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") # Token pieces that were added to the base vocabulary. self.added_tokens_dict = added_tokens - self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] - self.vocab_size_base = vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer + self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: @@ -512,11 +450,15 @@ class SentencePieceVocab: # LlaMa class HfVocab: - def __init__( - self, - fname_tokenizer: Path, - fname_added_tokens: Optional[Path] = None, - ) -> None: + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None: + try: + from transformers import AutoTokenizer + except ImportError as e: + raise ImportError( + "To use HfVocab, please install the `transformers` package. " + "You can install it with `pip install transformers`." + ) from e + print("fname_tokenizer:", fname_tokenizer) # Allow the tokenizer to default to slow or fast versions. # Explicitly set tokenizer to use local paths. 
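# A minimal usage sketch of the vocab loading refactored in this change, mirroring how the
# other converter script calls it earlier in this patch; the model directory below and the
# "spm" choice are placeholders, not part of the patch itself:
from pathlib import Path
import convert

model_dir = Path("path/to/model")  # directory holding tokenizer.model / vocab.json / tokenizer.json
vocab_factory = convert.VocabFactory(model_dir)
# the vocab type is one of "spm", "bpe" or "hfft"
vocab, special_vocab = vocab_factory.load_vocab("spm", model_dir)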
@@ -529,7 +471,7 @@ class HfVocab: # Initialize lists and dictionaries for added tokens self.added_tokens_list = [] self.added_tokens_dict = dict() - self.added_tokens_ids = set() + self.added_tokens_ids = set() # Process added tokens for tok, tokidx in sorted( @@ -550,12 +492,12 @@ class HfVocab: # Set vocabulary sizes self.vocab_size_base = self.tokenizer.vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer + self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens - def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]: + def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: reverse_vocab = { id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items() } @@ -573,11 +515,9 @@ class HfVocab: token_id, self.special_ids # Reuse already stored special IDs ) - def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType: + def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType: # Determine token type based on whether it's a special token - return ( - gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL - ) + return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL def get_token_score(self, token_id: int) -> float: # Placeholder for actual logic to determine the token's score @@ -589,7 +529,6 @@ class HfVocab: if text in self.specials: toktype = self.get_token_type(self.specials[text], self.special_ids) score = self.get_token_score(self.specials[text]) - else: toktype = gguf.TokenType.USER_DEFINED score = -1000.0 @@ -783,7 +722,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus: else: model = merge_sharded([mp.model for mp in models_plus]) - return ModelPlus(model, paths, format, vocab) + return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: @@ -871,17 +810,13 @@ class LazyUnpickler(pickle.Unpickler): CLASSES: dict[tuple[str, str], Any] = { # getattr used here as a workaround for mypy not being smart enough to determine # the staticmethods have a __func__ attribute. 
- ("torch._tensor", "_rebuild_from_type_v2"): getattr( - rebuild_from_type_v2, "__func__" - ), - ("torch._utils", "_rebuild_tensor_v2"): getattr( - lazy_rebuild_tensor_v2, "__func__" - ), - ("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16), - ("torch", "HalfStorage"): LazyStorageKind(DT_F16), - ("torch", "FloatStorage"): LazyStorageKind(DT_F32), - ("torch", "IntStorage"): LazyStorageKind(DT_I32), - ("torch", "Tensor"): LazyTensor, + ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), + ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'), + ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16), + ('torch', 'HalfStorage'): LazyStorageKind(DT_F16), + ('torch', 'FloatStorage'): LazyStorageKind(DT_F32), + ('torch', 'IntStorage'): LazyStorageKind(DT_I32), + ('torch', 'Tensor'): LazyTensor, } def find_class(self, module: str, name: str) -> Any: @@ -968,7 +903,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc executor_class = ProcessPoolExecutor else: executor_class = ThreadPoolExecutor - with executor_class(max_workers = max_workers) as executor: + with executor_class(max_workers=max_workers) as executor: futures: list[concurrent.futures.Future[Out]] = [] done = False for _ in range(concurrency): @@ -1022,12 +957,8 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N class OutputFile: - def __init__( - self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE - ) -> None: - self.gguf = gguf.GGUFWriter( - fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess - ) + def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None: + self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) def add_meta_arch(self, params: Params) -> None: name = "LLaMA" @@ -1036,21 +967,16 @@ class OutputFile: if params.n_ctx == 4096: name = "LLaMA v2" elif params.path_model is not None: - name = str(params.path_model.parent).split("/")[-1] + name = str(params.path_model.parent).split('/')[-1] - self.gguf.add_name(name) - self.gguf.add_context_length(params.n_ctx) - self.gguf.add_embedding_length(params.n_embd) - self.gguf.add_block_count(params.n_layer) - self.gguf.add_feed_forward_length(params.n_ff) + self.gguf.add_name (name) + self.gguf.add_context_length (params.n_ctx) + self.gguf.add_embedding_length (params.n_embd) + self.gguf.add_block_count (params.n_layer) + self.gguf.add_feed_forward_length (params.n_ff) self.gguf.add_rope_dimension_count(params.n_embd // params.n_head) - self.gguf.add_head_count(params.n_head) - self.gguf.add_head_count_kv(params.n_head_kv) - - if params.f_norm_eps is None: - raise ValueError("f_norm_eps is None") - - self.gguf.add_layer_norm_rms_eps(params.f_norm_eps) + self.gguf.add_head_count (params.n_head) + self.gguf.add_head_count_kv (params.n_head_kv) if params.n_experts: self.gguf.add_expert_count(params.n_experts) @@ -1058,6 +984,11 @@ class OutputFile: if params.n_experts_used: self.gguf.add_expert_used_count(params.n_experts_used) + if params.f_norm_eps: + self.gguf.add_layer_norm_rms_eps(params.f_norm_eps) + else: + raise ValueError('f_norm_eps is None') + if params.f_rope_freq_base is not None: self.gguf.add_rope_freq_base(params.f_rope_freq_base) @@ -1089,7 +1020,7 @@ class OutputFile: return tokenizer_model - def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]: + def extract_vocabulary_from_model(self, vocab: Vocab) -> 
tuple[list[bytes], list[float], list[gguf.TokenType]]: tokens = [] scores = [] toktypes = [] @@ -1124,14 +1055,10 @@ class OutputFile: def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: n_elements = int(np.prod(tensor.shape)) - raw_dtype = getattr(tensor.data_type, "ggml_type", None) - data_type = ( - getattr(tensor.data_type, "quantized_type", None) or tensor.data_type.dtype - ) + raw_dtype = getattr(tensor.data_type, 'ggml_type', None) + data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype data_nbytes = tensor.data_type.elements_to_bytes(n_elements) - self.gguf.add_tensor_info( - name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype - ) + self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype) def write_meta(self) -> None: self.gguf.write_header_to_file() @@ -1145,14 +1072,10 @@ class OutputFile: @staticmethod def write_vocab_only( - fname_out: Path, - params: Params, - vocab: Vocab, - svocab: gguf.SpecialVocab, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab: bool = False, + fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, ) -> None: - check_vocab_size(params, vocab, pad_vocab=pad_vocab) + check_vocab_size(params, vocab, pad_vocab = pad_vocab) of = OutputFile(fname_out, endianess=endianess) @@ -1180,14 +1103,8 @@ class OutputFile: @staticmethod def write_all( - fname_out: Path, - ftype: GGMLFileType, - params: Params, - model: LazyModel, - vocab: Vocab, - svocab: gguf.SpecialVocab, - concurrency: int = DEFAULT_CONCURRENCY, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, + concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) @@ -1207,26 +1124,19 @@ class OutputFile: of.write_tensor_info() # tensor data - ndarrays_inner = bounded_parallel_map( - OutputFile.do_item, model.items(), concurrency=concurrency - ) + ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency) if ftype == GGMLFileType.MostlyQ8_0: ndarrays = bounded_parallel_map( - OutputFile.maybe_do_quantize, - ndarrays_inner, - concurrency=concurrency, - max_workers=concurrency, + OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency, use_processpool_executor=True, ) else: ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) start = time.time() - for i, ((name, lazy_tensor), ndarray) in enumerate( - zip(model.items(), ndarrays) - ): + for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): elapsed = time.time() - start - size = " x ".join(f"{dim:6d}" for dim in lazy_tensor.shape) + size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) padi = len(str(len(model))) print( f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" @@ -1363,7 +1273,7 @@ def load_some_model(path: Path) -> ModelPlus: class VocabFactory: def __init__(self, path: Path): self.path = path - self.files = { + self.files: dict[str, Path | None] = { "tokenizer.model": None, "vocab.json": None, "tokenizer.json": None, @@ -1380,24 +1290,18 @@ class VocabFactory: self.files[file] = parent_file_path 
print(f"Found vocab files: {self.files}") - def _select_file(self, vocabtype: Optional[str]) -> Path: + def _select_file(self, vocabtype: str | None) -> Path: if vocabtype in ["spm", "bpe"]: for file_key in self.files.keys(): - if self.files[file_key]: - return self.files[file_key] + if (file := self.files[file_key]) is not None: + return file raise FileNotFoundError(f"{vocabtype} vocab not found.") - elif vocabtype == "hfft": + if vocabtype == "hfft": # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file return self.path - else: - raise ValueError(f"Unsupported vocabulary type {vocabtype}") + raise ValueError(f"Unsupported vocabulary type {vocabtype}") - def _create_special_vocab( - self, - vocab: Vocab, - vocabtype: str, - model_parent_path: Path, - ) -> gguf.SpecialVocab: + def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab: load_merges = vocabtype == "bpe" n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None return gguf.SpecialVocab( @@ -1407,13 +1311,12 @@ class VocabFactory: n_vocab=n_vocab, ) - def load_vocab( - self, vocabtype: str, model_parent_path: Path - ) -> Tuple[Vocab, gguf.SpecialVocab]: + def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: path = self._select_file(vocabtype) print(f"Loading vocab file '{path}', type '{vocabtype}'") added_tokens_path = path.parent / "added_tokens.json" + vocab: Vocab if vocabtype == "bpe": vocab = BpeVocab( path, added_tokens_path if added_tokens_path.exists() else None @@ -1428,6 +1331,7 @@ class VocabFactory: ) else: raise ValueError(f"Unsupported vocabulary type {vocabtype}") + # FIXME: Respect --vocab-dir? special_vocab = self._create_special_vocab( vocab, vocabtype, @@ -1436,18 +1340,17 @@ class VocabFactory: return vocab, special_vocab -def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Path: +def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: namestr = { - GGMLFileType.AllF32: "f32", + GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16", - GGMLFileType.MostlyQ8_0: "q8_0", + GGMLFileType.MostlyQ8_0:"q8_0", }[file_type] ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf" if ret in model_paths: sys.stderr.write( f"Error: Default output path ({ret}) would overwrite the input. " - "Please explicitly specify a path using --outfile.\n" - ) + "Please explicitly specify a path using --outfile.\n") sys.exit(1) return ret @@ -1457,111 +1360,34 @@ def do_dump_model(model_plus: ModelPlus) -> None: print(f"model_plus.format = {model_plus.format!r}") print(f"model_plus.vocab = {model_plus.vocab!r}") for name, lazy_tensor in model_plus.model.items(): - print( - f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}" - ) + print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") -def get_argument_parser() -> ArgumentParser: +def main(args_in: list[str] | None = None) -> None: output_choices = ["f32", "f16"] if np.uint32(1) == np.uint32(1).newbyteorder("<"): # We currently only support Q8_0 output on little endian systems. 
output_choices.append("q8_0") + vocab_types = ["spm", "bpe", "hfft"] + parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") + parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None) + parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") + parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") + parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") + parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") + parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") + parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY) + parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") + parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") - parser = argparse.ArgumentParser( - description="Convert a LLaMa model to a GGML compatible file" - ) - - parser.add_argument( - "model", - type=Path, - help="Directory containing the model file or the model file itself (*.pth, *.pt, *.bin)", - ) - - parser.add_argument( - "--awq-path", - type=Path, - help="Path to the Activation-aware Weight Quantization cache file", - default=None, - ) - - parser.add_argument( - "--dump", - action="store_true", - help="Display the model content without converting it", - ) - - parser.add_argument( - "--dump-single", - action="store_true", - help="Display the content of a single model file without conversion", - ) - - parser.add_argument( - "--vocab-only", - action="store_true", - help="Extract and output only the vocabulary", - ) - - parser.add_argument( - "--outtype", - choices=output_choices, - help="Output format - note: q8_0 may be very slow (default: f16 or f32 based on input)", - ) - - parser.add_argument( - "--vocab-dir", - type=Path, - help="Directory containing the tokenizer.model, if separate from the model file", - ) - - parser.add_argument( - "--vocab-type", - choices=["spm", "bpe", "hfft"], # hfft: Hugging Face Fast Tokenizer - default="spm", - help="The vocabulary format used to define the tokenizer model (default: spm)", - ) - - parser.add_argument( - "--pad-vocab", - action="store_true", - help="Add padding tokens when the model's vocabulary size exceeds the tokenizer metadata", - ) - - parser.add_argument( - "--outfile", - type=Path, - help="Specify the path for the output file (default is based on input)", - ) - - parser.add_argument( - "--ctx", type=int, help="Model training context (default is based on input)" - ) - - parser.add_argument( - "--concurrency", - type=int, - help=f"Concurrency used for conversion (default: 
{DEFAULT_CONCURRENCY})", - default=DEFAULT_CONCURRENCY, - ) - - parser.add_argument( - "--big-endian", - action="store_true", - help="Indicate that the model is executed on a big-endian machine", - ) - - return parser - - -def main(argv: Optional[list[str]] = None) -> None: - parser = get_argument_parser() - args = parser.parse_args(argv) - + args = parser.parse_args(args_in) if args.awq_path: - sys.path.insert(1, str(Path(__file__).resolve().parent / "awq-py")) - from awq.apply_awq import add_scale_weights - + sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) + from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] tmp_model_path = args.model / "weighted_model" if tmp_model_path.is_dir(): print(f"{tmp_model_path} exists as a weighted model.") @@ -1580,14 +1406,11 @@ def main(argv: Optional[list[str]] = None) -> None: if not args.vocab_only: model_plus = load_some_model(args.model) else: - model_plus = ModelPlus( - model={}, paths=[args.model / "dummy"], format="none", vocab=None - ) + model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) if args.dump: do_dump_model(model_plus) return - endianess = gguf.GGUFEndian.LITTLE if args.big_endian: endianess = gguf.GGUFEndian.BIG @@ -1595,12 +1418,10 @@ def main(argv: Optional[list[str]] = None) -> None: params = Params.load(model_plus) if params.n_ctx == -1: if args.ctx is None: - raise Exception( - "The model doesn't have a context size, and you didn't specify one with --ctx\n" - "Please specify one with --ctx:\n" - " - LLaMA v1: --ctx 2048\n" - " - LLaMA v2: --ctx 4096\n" - ) + raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n" + "Please specify one with --ctx:\n" + " - LLaMA v1: --ctx 2048\n" + " - LLaMA v2: --ctx 4096\n") params.n_ctx = args.ctx if args.outtype: @@ -1621,42 +1442,30 @@ def main(argv: Optional[list[str]] = None) -> None: if not args.outfile: raise ValueError("need --outfile if using --vocab-only") outfile = args.outfile - OutputFile.write_vocab_only( - outfile, - params, - vocab, - special_vocab, - endianess=endianess, - pad_vocab=args.pad_vocab, - ) + OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, + endianess=endianess, pad_vocab=args.pad_vocab) print(f"Wrote {outfile}") return if model_plus.vocab is not None and args.vocab_dir is None: vocab = model_plus.vocab - model = model_plus.model - model = convert_model_names(model, params) - ftype = pick_output_type(model, args.outtype) - model = convert_to_output_type(model, ftype) - outfile = args.outfile or default_output_file(model_plus.paths, ftype) + print(f"Vocab info: {vocab}") + print(f"Special vocab info: {special_vocab}") + + model = model_plus.model + model = convert_model_names(model, params) + ftype = pick_output_type(model, args.outtype) + model = convert_to_output_type(model, ftype) + outfile = args.outfile or default_outfile(model_plus.paths, ftype) params.ftype = ftype print(f"Writing {outfile}, format {ftype}") - OutputFile.write_all( - outfile, - ftype, - params, - model, - vocab, - special_vocab, - concurrency=args.concurrency, - endianess=endianess, - pad_vocab=args.pad_vocab, - ) + OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, + concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) print(f"Wrote {outfile}") -if __name__ == "__main__": - main(sys.argv[1:]) # Exclude the first element (script name) from sys.argv +if __name__ == '__main__': + main() diff --git 
a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 11fcbf443..b7e19c5fe 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1800,6 +1800,8 @@ int main(int argc, char ** argv) { std::vector train_samples_begin; std::vector train_samples_size; printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data); + printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str()); + printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false"); tokenize_file(lctx, params.common.fn_train_data, params.common.sample_start, diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 5a3d30b88..ea06fcdbf 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -26,6 +26,7 @@ struct StatParams { std::string ofile = "imatrix.dat"; int n_output_frequency = 10; int verbosity = 1; + int keep_every = 0; bool collect_output_weight = false; }; @@ -42,6 +43,9 @@ private: int m_last_call = 0; std::vector m_src1_data; std::vector m_ids; // the expert ids from ggml_mul_mat_id + // + void save_imatrix(const char * file_name) const; + void keep_imatrix(int ncall) const; }; bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { @@ -117,6 +121,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * if (m_last_call % m_params.n_output_frequency == 0) { save_imatrix(); } + if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { + keep_imatrix(m_last_call); + } } } } else { @@ -143,6 +150,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * if (m_last_call % m_params.n_output_frequency == 0) { save_imatrix(); } + if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { + keep_imatrix(m_last_call); + } } } @@ -150,7 +160,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } void IMatrixCollector::save_imatrix() const { - const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(); + save_imatrix(m_params.ofile.empty() ? 
"imatrix.dat" : m_params.ofile.c_str()); +} + +void IMatrixCollector::keep_imatrix(int ncall) const { + auto file_name = m_params.ofile; + if (file_name.empty()) file_name = "imatrix.dat"; + file_name += ".at_"; + file_name += std::to_string(ncall); + save_imatrix(file_name.c_str()); +} + +void IMatrixCollector::save_imatrix(const char * fname) const { std::ofstream out(fname, std::ios::binary); int n_entries = m_stats.size(); out.write((const char*)&n_entries, sizeof(n_entries)); @@ -248,7 +269,7 @@ static void process_logits( } } -static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { +static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) { const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); @@ -269,10 +290,12 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { } std::vector logit_history; - logit_history.resize(tokens.size()); - std::vector prob_history; - prob_history.resize(tokens.size()); + + if (compute_ppl) { + logit_history.resize(tokens.size()); + prob_history.resize(tokens.size()); + } const int n_chunk_max = tokens.size() / n_ctx; @@ -288,12 +311,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { std::vector workers(std::thread::hardware_concurrency() - 1); + const int num_batches = (n_ctx + n_batch - 1) / n_batch; + + std::vector logits; + if (compute_ppl && num_batches > 1) { + logits.reserve((size_t)n_ctx * n_vocab); + } + for (int i = 0; i < n_chunk; ++i) { const int start = i * n_ctx; const int end = start + n_ctx; - const int num_batches = (n_ctx + n_batch - 1) / n_batch; - std::vector logits; const auto t_start = std::chrono::high_resolution_clock::now(); @@ -321,8 +349,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { // restore the original token in case it was set to BOS tokens[batch_start] = token_org; - const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + if (compute_ppl && num_batches > 1) { + const auto * batch_logits = llama_get_logits(ctx); + logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + } } const auto t_end = std::chrono::high_resolution_clock::now(); @@ -338,25 +368,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); } - const int first = n_ctx/2; - process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, - workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); - count += n_ctx - first - 1; + if (compute_ppl) { + const int first = n_ctx/2; + const auto all_logits = num_batches > 1 ? 
logits.data() : llama_get_logits(ctx); + process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); + count += n_ctx - first - 1; - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); - fflush(stdout); + printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + fflush(stdout); + + logits.clear(); + } } printf("\n"); - nll2 /= count; - nll /= count; - const double ppl = exp(nll); - nll2 -= nll * nll; - if (nll2 > 0) { - nll2 = sqrt(nll2/(count-1)); - printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); - } else { - printf("Unexpected negative standard deviation of log(prob)\n"); + if (compute_ppl) { + nll2 /= count; + nll /= count; + const double ppl = exp(nll); + nll2 -= nll * nll; + if (nll2 > 0) { + nll2 = sqrt(nll2/(count-1)); + printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + } else { + printf("Unexpected negative standard deviation of log(prob)\n"); + } } return true; @@ -365,6 +402,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { int main(int argc, char ** argv) { StatParams sparams; + bool compute_ppl = true; std::vector args; args.push_back(argv[0]); int iarg = 1; @@ -381,12 +419,21 @@ int main(int argc, char ** argv) { } else if (arg == "--verbosity") { sparams.verbosity = std::stoi(argv[++iarg]); + } else if (arg == "--no-ppl") { + compute_ppl = false; + } else if (arg == "--keep-imatrix") { + sparams.keep_every = std::stoi(argv[++iarg]); } else { args.push_back(argv[iarg]); } } if (iarg < argc) { - args.push_back(argv[iarg]); + std::string arg{argv[iarg]}; + if (arg == "--no-ppl") { + compute_ppl = false; + } else { + args.push_back(argv[iarg]); + } } gpt_params params; @@ -448,7 +495,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s\n", get_system_info(params).c_str()); } - bool OK = compute_imatrix(ctx, params); + bool OK = compute_imatrix(ctx, params, compute_ppl); if (!OK) { return 1; } diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 4a7827876..72fb133b4 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -241,7 +241,7 @@ int main(int argc, char ** argv) { LOG("add_bos: %d\n", add_bos); bool suff_rm_leading_spc = params.escape; - if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { + if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { params.input_suffix.erase(0, 1); suff_rm_leading_spc = false; } diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts index 7815a8025..aadbe22c9 100644 --- a/examples/llama.android/app/build.gradle.kts +++ b/examples/llama.android/app/build.gradle.kts @@ -30,6 +30,7 @@ android { } externalNativeBuild { cmake { + arguments += "-DCMAKE_BUILD_TYPE=Release" cppFlags += listOf() arguments += listOf() } diff --git a/examples/llama.vim b/examples/llama.vim index f03fadfb7..1b5ad6ba0 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -6,7 +6,7 @@ " Similarly, you could add an insert mode keybind with " inoremap call llama#doLlamaGen() " -" g:llama_api_url and g:llama_overrides can be configured in your .vimrc +" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc " let g:llama_api_url = "192.168.1.10:8080" " llama_overrides can also be set through buffer/window scopes. 
For instance
" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
@@ -82,6 +82,9 @@ func llama#doLlamaGen()
 endif
 let l:querydata.prompt = join(l:buflines, "\n")
 let l:curlcommand = copy(s:curlcommand)
+ if exists("g:llama_api_key")
+   call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
+ endif
 let l:curlcommand[2] = json_encode(l:querydata)
 let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
 endfunction
diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md
new file mode 100644
index 000000000..c6258eba6
--- /dev/null
+++ b/examples/llava/MobileVLM-README.md
@@ -0,0 +1,131 @@
+# MobileVLM
+
+Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
+
+For more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM).
+
+The implementation is based on llava, and is compatible with llava and MobileVLM. The usage is basically the same as llava.
+
+## Usage
+Build with cmake or run `make llava-cli` to build it.
+
+After building, run: `./llava-cli` to see the usage. For example:
+
+```sh
+./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
+    --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
+    --image path/to/an/image.jpg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
+```
+
+## Model conversion
+
+1. Clone `MobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
+
+```sh
+git clone https://huggingface.co/mtgv/MobileVLM-1.7B
+
+git clone https://huggingface.co/openai/clip-vit-large-patch14-336
+```
+
+2. Use `llava-surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents:
+
+```sh
+python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
+```
+
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
+
+```sh
+python ./examples/llava/convert-image-encoder-to-gguf.py \
+    -m path/to/clip-vit-large-patch14-336 \
+    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
+    --output-dir path/to/MobileVLM-1.7B \
+    --projector-type ldp
+```
+
+4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+
+```sh
+python ./convert.py path/to/MobileVLM-1.7B
+```
+
+5. Use `quantize` to convert the LLaMA part's DataType from `fp16` to `q4_k`:
+```sh
+./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+```
+
+Now both the LLaMA part and the image encoder are in the `MobileVLM-1.7B` directory.
+
+## Android compile and run
+### Compile
+Refer to `examples/llava/android/build_64.sh`:
+```sh
+mkdir examples/llava/android/build_64
+cd examples/llava/android/build_64
+../build_64.sh
+```
+### Run on Android
+Refer to `android/adb_run.sh` and modify the resources' `name` and `path`.
+
+## Some results on Android with a `Snapdragon 888` chip
+### Case 1
+**input**
+```sh
+/data/local/tmp/llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    -t 4 \
+    --image /data/local/tmp/demo.jpg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
+```
+**output**
+```sh
+encode_image_with_clip: image encoded in 21148.71 ms by CLIP ( 146.87 ms per image patch)
+ Susan Wise Bauer
+llama_print_timings: load time = 23574.72 ms
+llama_print_timings: sample time = 1.24 ms / 6 runs ( 0.21 ms per token, 4850.44 tokens per second)
+llama_print_timings: prompt eval time = 12460.15 ms / 246 tokens ( 50.65 ms per token, 19.74 tokens per second)
+llama_print_timings: eval time = 424.86 ms / 6 runs ( 70.81 ms per token, 14.12 tokens per second)
+llama_print_timings: total time = 34731.93 ms
+```
+### Case 2
+**input**
+```sh
+/data/local/tmp/llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    -t 4 \
+    --image /data/local/tmp/cat.jpeg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:"
+```
+
+**output**
+```sh
+encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch)
+ The image depicts a cat sitting in the grass near some tall green plants.
+llama_print_timings: load time = 23257.32 ms
+llama_print_timings: sample time = 5.25 ms / 18 runs ( 0.29 ms per token, 3430.53 tokens per second)
+llama_print_timings: prompt eval time = 11900.73 ms / 232 tokens ( 51.30 ms per token, 19.49 tokens per second)
+llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 ms per token, 14.07 tokens per second)
+llama_print_timings: total time = 34570.79 ms
+```
+
+## Minor shortcomings
+The output `n_patch` of `ldp` is 1/4 of the input. For a quick implementation, we uniformly modified the `clip_n_patches` function to return a quarter of the patches. When measuring time consumption, the reported per-patch time is therefore 4 times larger than the real cost.
+
+## TODO
+
+- [ ] Support non-CPU backends for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
+- [ ] Optimize LDP projector performance
+
+  - Optimize the structure definition to avoid unnecessary memory rearrangements and reduce the use of `ggml_permute_cpy`;
+  - Optimize the operator implementations (ARM CPU/NVIDIA GPU): depthwise conv, hardswish, hardsigmoid, etc.
+- [ ] Run MobileVLM on `Jetson Orin`
+- [ ] Support more model variants, such as `MobileVLM-3B`.
+
+
+## Contributors
+```sh
+zhangjidong05, yangyang260, huyiming03, chenxiaotao03
+```
diff --git a/examples/llava/android/adb_run.sh b/examples/llava/android/adb_run.sh
new file mode 100755
index 000000000..f73623ae3
--- /dev/null
+++ b/examples/llava/android/adb_run.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed"
+projector_name="mmproj-model-f16.gguf"
+llama_name="ggml-model-q4_k.gguf"
+img_dir="/Users/cxt/model/llm"
+img_name="demo.jpg"
+prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
+# img_name="cat.jpeg"
+# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:"
+
+program_dir="build_64/bin"
+binName="llava-cli"
+n_threads=4
+
+
+deviceDir="/data/local/tmp"
+saveDir="output"
+if [ ! 
-d ${saveDir} ]; then + mkdir ${saveDir} +fi + + +function android_run() { + # # copy resource into device + # adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name} + # adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name} + adb push ${img_dir}/${img_name} ${deviceDir}/${img_name} + # copy program into device + adb push ${program_dir}/${binName} ${deviceDir}/${binName} + adb shell "chmod 0777 ${deviceDir}/${binName}" + + # run + adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \ + -m ${deviceDir}/${llama_name} \ + --mmproj ${deviceDir}/${projector_name} \ + -t ${n_threads} \ + --image ${deviceDir}/${img_name} \ + -p \"${prompt}\" \ + > ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt" + adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \ + -m ${deviceDir}/${llama_name} \ + --mmproj ${deviceDir}/${projector_name} \ + -t ${n_threads} \ + --image ${deviceDir}/${img_name} \ + -p \"${prompt}\" \ + >> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt 2>&1" + adb pull ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir} +} + +android_run + +echo "android_run is Done!" diff --git a/examples/llava/android/build_64.sh b/examples/llava/android/build_64.sh new file mode 100755 index 000000000..71b6fd3f7 --- /dev/null +++ b/examples/llava/android/build_64.sh @@ -0,0 +1,8 @@ +#!/bin/bash +cmake ../../../../ \ +-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ +-DCMAKE_BUILD_TYPE=Release \ +-DANDROID_ABI="arm64-v8a" \ +-DANDROID_PLATFORM=android-23 $1 + +make -j4 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 2ae8853d3..9129052a2 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -2,17 +2,6 @@ // so there might be still unnecessary artifacts hanging around // I'll gradually clean and extend it -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - #include "clip.h" #include "ggml.h" #include "ggml-alloc.h" @@ -29,6 +18,19 @@ #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + static std::string format(const char * fmt, ...) { va_list ap; va_list ap2; @@ -67,6 +69,7 @@ static std::string format(const char * fmt, ...) { #define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_IMAGE_MEAN "clip.vision.image_mean" #define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_PROJ_TYPE "clip.projector_type" // // tensor name constants @@ -89,6 +92,22 @@ static std::string format(const char * fmt, ...) 
{ #define TN_TEXT_PROJ "text_projection.weight" #define TN_VIS_PROJ "visual_projection.weight" #define TN_LLAVA_PROJ "mm.%d.%s" +#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" +#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" + + +enum projector_type { + PROJECTOR_TYPE_MLP, + PROJECTOR_TYPE_MLP_NORM, + PROJECTOR_TYPE_LDP, + PROJECTOR_TYPE_UNKNOWN, +}; + +static std::map PROJECTOR_TYPE_NAMES = { + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, +}; + // // utilities to get data from a gguf file @@ -129,6 +148,91 @@ static std::string get_ftype(int ftype) { return ggml_type_name(static_cast(ftype)); } +static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { + switch (type) { + case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); + case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); + case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]); + case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); + case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); + case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); + case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); + case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); + case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); + case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); + case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; + default: return format("unknown type %d", type); + } +} + + +static void replace_all(std::string & s, const std::string & search, const std::string & replace) { + std::string result; + for (size_t pos = 0; ; pos += search.length()) { + auto new_pos = s.find(search, pos); + if (new_pos == std::string::npos) { + result += s.substr(pos, s.size() - pos); + break; + } + result += s.substr(pos, new_pos - pos) + replace; + pos = new_pos; + } + s = std::move(result); +} + +static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { + const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); + + switch (type) { + case GGUF_TYPE_STRING: + return gguf_get_val_str(ctx_gguf, i); + case GGUF_TYPE_ARRAY: + { + const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); + int arr_n = gguf_get_arr_n(ctx_gguf, i); + const void * data = gguf_get_arr_data(ctx_gguf, i); + std::stringstream ss; + ss << "["; + for (int j = 0; j < arr_n; j++) { + if (arr_type == GGUF_TYPE_STRING) { + std::string val = gguf_get_arr_str(ctx_gguf, i, j); + // escape quotes + replace_all(val, "\\", "\\\\"); + replace_all(val, "\"", "\\\""); + ss << '"' << val << '"'; + } else if (arr_type == GGUF_TYPE_ARRAY) { + ss << "???"; + } else { + ss << gguf_data_to_str(arr_type, data, j); + } + if (j < arr_n - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + } + default: + return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); + } +} + +static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") { + size_t tensor_size = ggml_nbytes(tensor); + printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", + prefix, ggml_n_dims(tensor), tensor->name, tensor_size, + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type)); +} + +static projector_type clip_projector_type_from_string(const std::string & name) { + for (const auto & kv : 
PROJECTOR_TYPE_NAMES) { // NOLINT + if (kv.second == name) { + return kv.first; + } + } + return PROJECTOR_TYPE_UNKNOWN; +} + // // image data // @@ -201,10 +305,44 @@ struct clip_vision_model { struct ggml_tensor * projection; // LLaVA projection - struct ggml_tensor * mm_0_w; - struct ggml_tensor * mm_0_b; - struct ggml_tensor * mm_2_w; - struct ggml_tensor * mm_2_b; + struct ggml_tensor * mm_0_w = NULL; + struct ggml_tensor * mm_0_b = NULL; + struct ggml_tensor * mm_2_w = NULL; + struct ggml_tensor * mm_2_b = NULL; + + // Yi type models with mlp+normalization projection + struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4 + struct ggml_tensor * mm_1_b = NULL; + struct ggml_tensor * mm_3_w = NULL; + struct ggml_tensor * mm_3_b = NULL; + struct ggml_tensor * mm_4_w = NULL; + struct ggml_tensor * mm_4_b = NULL; + + // MobileVLM projection + struct ggml_tensor * mm_model_mlp_1_w; + struct ggml_tensor * mm_model_mlp_1_b; + struct ggml_tensor * mm_model_mlp_3_w; + struct ggml_tensor * mm_model_mlp_3_b; + struct ggml_tensor * mm_model_block_1_block_0_0_w; + struct ggml_tensor * mm_model_block_1_block_0_1_w; + struct ggml_tensor * mm_model_block_1_block_0_1_b; + struct ggml_tensor * mm_model_block_1_block_1_fc1_w; + struct ggml_tensor * mm_model_block_1_block_1_fc1_b; + struct ggml_tensor * mm_model_block_1_block_1_fc2_w; + struct ggml_tensor * mm_model_block_1_block_1_fc2_b; + struct ggml_tensor * mm_model_block_1_block_2_0_w; + struct ggml_tensor * mm_model_block_1_block_2_1_w; + struct ggml_tensor * mm_model_block_1_block_2_1_b; + struct ggml_tensor * mm_model_block_2_block_0_0_w; + struct ggml_tensor * mm_model_block_2_block_0_1_w; + struct ggml_tensor * mm_model_block_2_block_0_1_b; + struct ggml_tensor * mm_model_block_2_block_1_fc1_w; + struct ggml_tensor * mm_model_block_2_block_1_fc1_b; + struct ggml_tensor * mm_model_block_2_block_1_fc2_w; + struct ggml_tensor * mm_model_block_2_block_1_fc2_b; + struct ggml_tensor * mm_model_block_2_block_2_0_w; + struct ggml_tensor * mm_model_block_2_block_2_1_w; + struct ggml_tensor * mm_model_block_2_block_2_1_b; }; struct clip_ctx { @@ -213,6 +351,7 @@ struct clip_ctx { bool has_llava_projector = false; struct clip_vision_model vision_model; + projector_type proj_type = PROJECTOR_TYPE_MLP; float image_mean[3]; float image_std[3]; @@ -330,6 +469,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // pre-layernorm { embeddings = ggml_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "pre_ln"); embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b); } @@ -430,16 +570,156 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 free(patches_data); } + // shape [1, 576, 1024] + // ne is whcn, ne = [1024, 576, 1, 1] embeddings = ggml_get_rows(ctx0, embeddings, patches); - // mm projection 0 - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + // print_tensor_info(embeddings, "embeddings"); - embeddings = ggml_gelu(ctx0, embeddings); + // llava projector + if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + embeddings = ggml_gelu(ctx0, embeddings); + + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, 
embeddings, model.mm_2_b); + + } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); + // First LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), + model.mm_1_b); + + // GELU activation + embeddings = ggml_gelu(ctx0, embeddings); + + // Second linear layer + embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); + + // Second LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), + model.mm_4_b); + } + else if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projector + int n_patch = 24; + struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); + mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); + mlp_1 = ggml_gelu(ctx0, mlp_1); + struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); + mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); + // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] + + // block 1 + struct ggml_tensor * block_1 = nullptr; + { + // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] + mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); + mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); + // stride = 1, padding = 1, bias is nullptr + block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + + // layer norm + // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // hardswish + struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = 
ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // residual + block_1 = ggml_add(ctx0, mlp_3, block_1); + } + + // block_2 + { + // stride = 2 + block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // layer norm + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // hardswish + struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + // not sure the parameters is right for globalAvgPooling + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + + // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); + block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); + // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] + } + embeddings = block_1; + } + else { + GGML_ASSERT(false); + } } // build the graph @@ -485,16 +765,47 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { printf("\n"); } const int n_tensors = gguf_get_n_tensors(ctx); + // kv - if (verbosity >= 3) { - const int n_kv = gguf_get_n_kv(ctx); + const int n_kv = gguf_get_n_kv(ctx); + printf("%s: loaded 
meta data with %d key-value pairs and %d tensors from %s\n", + __func__, n_kv, n_tensors, fname); + { + std::map n_type; - for (int i = 0; i < n_kv; ++i) { - const char * key = gguf_get_key(ctx, i); + for (int i = 0; i < n_tensors; i++) { + enum ggml_type type = gguf_get_tensor_type(ctx, i); - printf("%s: kv[%d]: key = %s\n", __func__, i, key); + n_type[type]++; + } + + printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + for (int i = 0; i < n_kv; i++) { + const char * name = gguf_get_key(ctx, i); + const enum gguf_type type = gguf_get_kv_type(ctx, i); + const std::string type_name = + type == GGUF_TYPE_ARRAY + ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i)) + : gguf_type_name(type); + + std::string value = gguf_kv_to_str(ctx, i); + const size_t MAX_VALUE_LEN = 40; + if (value.size() > MAX_VALUE_LEN) { + value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); + } + replace_all(value, "\n", "\\n"); + + printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); + } + + // print type counts + for (auto & kv : n_type) { + if (kv.second == 0) { + continue; + } + + printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); } - printf("\n"); } // data @@ -503,12 +814,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i); + enum ggml_type type = gguf_get_tensor_type(ctx, i); struct ggml_tensor * cur = ggml_get_tensor(meta, name); size_t tensor_size = ggml_nbytes(cur); buffer_size += tensor_size; if (verbosity >= 3) { - printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i, - ggml_n_dims(cur), cur->name, tensor_size, offset); + printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", + __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); } } } @@ -517,6 +829,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { clip_ctx * new_clip = new clip_ctx; + // update projector type + { + int idx = gguf_find_key(ctx, KEY_PROJ_TYPE); + if (idx != -1) { + const std::string proj_type = gguf_get_val_str(ctx, idx); + new_clip->proj_type = clip_projector_type_from_string(proj_type); + } + else { + new_clip->proj_type = PROJECTOR_TYPE_MLP; + } + if (new_clip->proj_type == PROJECTOR_TYPE_MLP) { + if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) { + new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM; + } + } + } + #ifdef GGML_USE_CUBLAS new_clip->backend = ggml_backend_cuda_init(0); printf("%s: CLIP using CUDA backend\n", __func__); @@ -661,10 +990,63 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); - vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); - vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); - 
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); - vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); + + // LLaVA projection + if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { + vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); + vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); + try { + // Yi-type llava + vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight")); + vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias")); + } catch (std::runtime_error & e) { } + try { + // missing in Yi-type llava + vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); + vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); + } catch (std::runtime_error & e) { } + try { + // Yi-type llava + vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight")); + vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias")); + } catch (std::runtime_error & e) { } + try { + // Yi-type llava + vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight")); + vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias")); + } catch (std::runtime_error & e) { } + } + else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projection + vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); + vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias")); + vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight")); + vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias")); + vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); + vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); + vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); + vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); + vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); + vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); + vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); + vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); + vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); + vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); + vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); + vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); + vision_model.mm_model_block_2_block_0_1_b = 
get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); + vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); + vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); + vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); + vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); + vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); + vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); + vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + } + else { + std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; + throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); + } vision_model.layers.resize(hparams.n_layer); for (int il = 0; il < hparams.n_layer; ++il) { @@ -949,7 +1331,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i ".*weight", }; - std::vector read_data(512); std::vector work(512); std::vector conv_buf(512); std::vector hist_all(1 << 4, 0); @@ -1100,13 +1481,27 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i } int clip_n_mmproj_embd(const struct clip_ctx * ctx) { - return ctx->vision_model.mm_2_b->ne[0]; + if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + } + else if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + return ctx->vision_model.mm_2_b->ne[0]; + } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { + return ctx->vision_model.mm_3_b->ne[0]; + } + else { + std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; + throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); + } } int clip_n_patches(const struct clip_ctx * ctx) { auto & params = ctx->vision_model.hparams; - - return (params.image_size / params.patch_size) * (params.image_size / params.patch_size); + int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); + if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + n_patches /= 4; + } + return n_patches; } size_t clip_embd_nbytes(const struct clip_ctx * ctx) { diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index 03688e0ea..f5a3c9b46 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -81,6 +81,7 @@ ap.add_argument("--vision-only", action="store_true", required=False, ap.add_argument("--clip_model_is_vision", action="store_true", required=False, help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") +ap.add_argument("--projector-type", help="Type of projector. 
Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
@@ -174,6 +175,8 @@ elif args.vision_only and not has_llava_projector:
     fout.add_description("vision-only CLIP model")
 elif has_llava_projector:
     fout.add_description("image encoder for LLaVA")
+    # add projector type
+    fout.add_string("clip.projector_type", args.projector_type)
 else:
     fout.add_description("two-tower CLIP model")
@@ -218,7 +221,8 @@ if has_llava_projector:
     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
         name = get_tensor_name(name)
-        if data.ndim == 2:
+        # pw and dw conv ndim==4
+        if data.ndim == 2 or data.ndim == 4:
             data = data.squeeze().numpy().astype(np.float16)
         else:
             data = data.squeeze().numpy().astype(np.float32)
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index d94795fe3..6ac70ba69 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -148,10 +148,35 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));

-    // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
-    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
+    std::string system_prompt, user_prompt;
+    size_t image_pos = prompt.find("<image>");
+    if (image_pos != std::string::npos) {
+        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
+
+        system_prompt = prompt.substr(0, image_pos);
+        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
+        // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
+        size_t pos = 0;
+        while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
+            user_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
+        }
+        while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
+            system_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
+        }
+
+        printf("system_prompt: %s\n", system_prompt.c_str());
+        printf("user_prompt: %s\n", user_prompt.c_str());
+    } else {
+        // llava-1.5 native mode
+        system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
+        user_prompt = prompt + "\nASSISTANT:";
+    }
+
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
     llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);

     // generate the response
@@ -162,6 +187,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     for (int i = 0; i < max_tgt_len; i++) {
         const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
         if (strcmp(tmp, "</s>") == 0) break;
+        if (strstr(tmp, "###")) break; // Yi-VL behavior
         printf("%s", tmp);
         fflush(stdout);
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index b07320190..8d2204969 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -112,6 +112,43 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
     return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
 }

+static inline int nearest_int(float fval) {
+    //assert(fval <= 4194303.f);
+    float val = fval + 12582912.f;
+    int i; memcpy(&i, &val, sizeof(int));
+    return (i & 0x007fffff) - 0x00400000;
+}
+
+static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob, int tok) {
+    float max_logit = logits[0];
+    float min_logit = logits[0];
+    for (int i = 1; i < n_vocab; ++i) {
+        max_logit = std::max(max_logit, logits[i]);
+        min_logit = std::min(min_logit, logits[i]);
+    }
+    min_logit = std::max(min_logit, max_logit - 16);
+    double sum_exp = 0.0;
+    for (int i = 0; i < n_vocab; ++i) {
+        sum_exp += expf(logits[i] - max_logit);
+    }
+    const float log_sum_exp = log(sum_exp);
+    const float min_log_prob = min_logit - max_logit - log_sum_exp;
+    const float scale = (max_logit - min_logit)/65535.f;
+    float * d = (float *)log_prob;
+    d[0] = scale;
+    d[1] = min_log_prob;
+    log_prob += 4;
+    if (scale) {
+        const float inv_scale = 1/scale;
+        for (int i = 0; i < n_vocab; ++i) {
+            log_prob[i] = logits[i] > min_logit ?
nearest_int(inv_scale*(logits[i] - min_logit)) : 0; + } + } else { + std::memset(log_prob, 0, n_vocab*sizeof(uint16_t)); + } + return max_logit + log_sum_exp - logits[tok]; +} + static void process_logits( int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, double & nll, double & nll2, float * logit_history, float * prob_history @@ -147,6 +184,130 @@ static void process_logits( } } +static void process_logits(std::ostream& out, int n_vocab, const float * logits, const int * tokens, int n_token, + std::vector & workers, std::vector & log_probs, double & nll, double & nll2) { + std::mutex mutex; + const int nv = 2*((n_vocab + 1)/2) + 4; + int counter = 0; + auto compute = [&mutex, &counter, &log_probs, &nll, &nll2, n_vocab, logits, tokens, n_token, nv] () { + double local_nll = 0; + double local_nll2 = 0; + while (true) { + std::unique_lock lock(mutex); + int i = counter++; + if (i >= n_token) { + nll += local_nll; nll2 += local_nll2; + break; + } + lock.unlock(); + const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]); + local_nll += v; + local_nll2 += v*v; + } + }; + for (auto & w : workers) { + w = std::thread(compute); + } + compute(); + for (auto & w : workers) { + w.join(); + } + out.write((const char *)log_probs.data(), n_token*nv*sizeof(uint16_t)); +} + +struct kl_divergence_result { + double sum_nll = 0; + double sum_nll2 = 0; + double sum_kld = 0; + double sum_kld2 = 0; + double sum_nll_diff = 0; + double sum_nll_diff2 = 0; + size_t n_same_top = 0; + size_t count = 0; +}; + +static double log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) { + float max_logit = logits[0]; + int imax = 0; + for (int i = 1; i < n_vocab; ++i) { + if (logits[i] > max_logit) { + max_logit = logits[i]; + imax = i; + } + } + double sum_exp = 0.0; + for (int i = 0; i < n_vocab; ++i) { + sum_exp += expf(logits[i] - max_logit); + } + const float log_sum_exp = log(sum_exp); + const float * d = (const float *)base_log_prob; + const float scale = d[0]; + const float min_log_prob = d[1]; + base_log_prob += 4; + float nll = max_logit + log_sum_exp - logits[tok]; + kld.sum_nll += nll; + kld.sum_nll2 += nll*nll; + nll += (scale*base_log_prob[tok] + min_log_prob); + kld.sum_nll_diff += nll; + kld.sum_nll_diff2 += nll*nll; + max_logit += log_sum_exp; + double sum = 0; + int imax_base = -1; + float p_log_base_max = 0; + for (int i = 0; i < n_vocab; ++i) { + const float p_log_base = scale*base_log_prob[i] + min_log_prob; + if (i == 0 || p_log_base > p_log_base_max) { + p_log_base_max = p_log_base; + imax_base = i; + } + if (p_log_base > -16.f) { + const float p_base = expf(p_log_base); + sum += p_base * (p_log_base - logits[i] + max_logit); + } + } + kld.sum_kld += sum; + kld.sum_kld2 += sum*sum; + ++kld.count; + if (imax == imax_base) ++kld.n_same_top; + return sum; +} + +static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, + std::vector & workers, const std::vector & base_log_probs, kl_divergence_result & kld, + float * kld_values) { + std::mutex mutex; + const int nv = 2*((n_vocab + 1)/2) + 4; + int counter = 0; + auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values] () { + kl_divergence_result local_kld; + while (true) { + std::unique_lock lock(mutex); + int i = counter++; + if (i >= n_token) { + kld.sum_nll += local_kld.sum_nll; + kld.sum_nll2 += local_kld.sum_nll2; + 
kld.sum_kld += local_kld.sum_kld; + kld.sum_kld2 += local_kld.sum_kld2; + kld.sum_nll_diff += local_kld.sum_nll_diff; + kld.sum_nll_diff2 += local_kld.sum_nll_diff2; + kld.n_same_top += local_kld.n_same_top; + kld.count += local_kld.count; + break; + } + lock.unlock(); + double v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld); + kld_values[i] = (float)v; + } + }; + for (auto & w : workers) { + w = std::thread(compute); + } + compute(); + for (auto & w : workers) { + w.join(); + } +} + static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` @@ -294,6 +455,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); + std::ofstream logits_stream; + if (!params.logits_file.empty()) { + logits_stream.open(params.logits_file.c_str()); + if (!logits_stream.is_open()) { + fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str()); + return {}; + } + fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str()); + logits_stream.write("_logits_", 8); + logits_stream.write((const char *)&n_ctx, sizeof(n_ctx)); + } + auto tim1 = std::chrono::high_resolution_clock::now(); fprintf(stderr, "%s: tokenizing the input ..\n", __func__); @@ -336,6 +509,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par std::vector workers(std::thread::hardware_concurrency() - 1); + std::vector log_probs; + if (!params.logits_file.empty()) { + logits_stream.write((const char *)&n_vocab, sizeof(n_vocab)); + logits_stream.write((const char *)&n_chunk, sizeof(n_chunk)); + logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0])); + const int nv = 2*((n_vocab + 1)/2) + 4; + log_probs.resize(n_ctx * nv); + } + for (int i = 0; i < n_chunk; ++i) { const int start = i * n_ctx; const int end = start + n_ctx; @@ -398,8 +580,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par // process the entire prompt. const int first = n_ctx/2; const float * all_logits = num_batches > 1 ? 
logits.data() : llama_get_logits(ctx); - process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, - workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); + if (!params.logits_file.empty()) { + process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + workers, log_probs, nll, nll2); + } else { + process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); + } count += n_ctx - first - 1; // perplexity is e^(average negative log-likelihood) @@ -458,23 +645,24 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector< return true; } +#define K_TOKEN_CHUNK 4 + static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector& workers, const std::vector>& eval_pairs, std::vector& eval_results) { - constexpr int k_token_chunk = 4; if (eval_results.size() != eval_pairs.size()) { eval_results.resize(eval_pairs.size()); } if (eval_pairs.empty()) return; - size_t max_threads = std::min((eval_pairs.size() + k_token_chunk - 1)/k_token_chunk, workers.size()); + size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size()); std::atomic counter(0); auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () { - float local_logprobs[k_token_chunk]; + float local_logprobs[K_TOKEN_CHUNK]; while (true) { - size_t first = counter.fetch_add(k_token_chunk, std::memory_order_relaxed); + size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed); if (first >= eval_results.size()) break; - size_t last = std::min(first + k_token_chunk, eval_results.size()); + size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size()); for (size_t i = first; i < last; ++i) { auto logits = batch_logits + eval_pairs[i].first * n_vocab; float max_logit = logits[0]; @@ -497,7 +685,6 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto for (size_t it = 0; it < max_threads; ++it) { workers[it].join(); } - } static void hellaswag_score(llama_context * ctx, const gpt_params & params) { @@ -540,14 +727,14 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { // This is needed as usual for LLaMA models const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + // The tasks should be randomized so the score stabilizes quickly. + bool randomize_tasks = true; + // Number of tasks to use when computing the score if (params.hellaswag_tasks < hs_task_count) { hs_task_count = params.hellaswag_tasks; } - // The tasks should be randomized so the score stabilizes quickly. 
- bool randomize_tasks = true; - // The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now std::mt19937 rng(1); @@ -1031,6 +1218,566 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma); } +static bool deserialize_string(std::istream & in, std::string & str) { + uint32_t size; + if (!in.read((char *)&size, sizeof(size)).fail()) { + str.resize(size); + if (!in.read((char *)&str[0], size).fail()) return true; + } + return false; +} + +struct multiple_choice_answers { + std::vector answers; + std::vector labels; + bool deserialize(std::istream& in) { + uint32_t n; + in.read((char *)&n, sizeof(n)); + if (in.fail() || n > 100) return false; // 100 as max. number of answers should be good enough for any practical purpose + answers.resize(n); + labels.resize(n); + for (auto& a : answers) { + if (!deserialize_string(in, a)) return false; + } + in.read((char *)labels.data(), n*sizeof(int)); + return !in.fail(); + } +}; + +struct multiple_choice_task { + std::string question; // the question (or context that needs to be continued) + multiple_choice_answers mc1; // possible answers (continuations) with a single correct answer + multiple_choice_answers mc2; // possible answers (continuations) with multiple correct answers - not handled yet + bool deserialize(std::istream& in) { + if (!deserialize_string(in, question)) return false; + return mc1.deserialize(in) && mc2.deserialize(in); + } + + // For evaluation + size_t i_batch; // starting index in the llama_batch + size_t common_prefix; // max number of initial tokens that are the same in all sentences + size_t required_tokens; // needed number of tokens to evaluate all answers + std::vector> seq_tokens; + std::vector log_probs; +}; + +static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) { + if (task.question.empty() || task.mc1.answers.empty()) { + if (log_error) { + printf("%s: found bad task with empty question and/or answers\n", __func__); + } + return false; + } + task.seq_tokens.reserve(task.mc1.answers.size()); + for (auto& answer : task.mc1.answers) { + if (answer.empty()) { + if (log_error) { + printf("%s: found empty answer\n", __func__); + } + return false; + } + task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos)); + } + auto min_len = task.seq_tokens.front().size(); + for (auto& seq : task.seq_tokens) { + min_len = std::min(min_len, seq.size()); + } + task.common_prefix = 0; + for (size_t k = 0; k < min_len; ++k) { + auto token = task.seq_tokens[0][k]; + bool all_same = true; + for (size_t i = 1; i < task.seq_tokens.size(); ++i) { + if (task.seq_tokens[i][k] != token) { + all_same = false; + break; + } + } + if (!all_same) { + break; + } + ++task.common_prefix; + } + task.required_tokens = task.common_prefix; + for (auto& seq : task.seq_tokens) { + task.required_tokens += seq.size() - task.common_prefix; + } + return true; +} + +// +// Calculates score for multiple choice tasks with single correct answer from prompt. 
+// Commonly used LLM evaluation metrics of this type are +// * ARC +// * HellaSwag +// * MMLU +// * TruthfulQA +// +// Validation datasets for these 4 tests can be found at +// https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp +// The data for these datasets was extracted from +// git@hf.co:datasets/allenai/ai2_arc +// https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl +// git@hf.co:datasets/Stevross/mmlu +// https://huggingface.co/datasets/truthful_qa +// +static void multiple_choice_score(llama_context * ctx, const gpt_params & params) { + + std::istringstream strstream(params.prompt); + uint32_t n_task; + strstream.read((char *)&n_task, sizeof(n_task)); + if (strstream.fail() || n_task == 0) { + printf("%s: no tasks\n", __func__); + return; + } + printf("%s: there are %u tasks in prompt\n", __func__, n_task); + std::vector task_pos(n_task); + strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t)); + if (strstream.fail()) { + printf("%s: failed to raad task positions from prompt\n", __func__); + return; + } + + std::vector tasks; + if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) { + // Use all tasks + tasks.resize(n_task); + printf("%s: reading tasks", __func__); + int n_dot = n_task/100; + int i = 0; + for (auto& task : tasks) { + ++i; + if (!task.deserialize(strstream)) { + printf("%s: failed to read task %d of %u\n", __func__, i, n_task); + return; + } + if (i%n_dot == 0) printf("."); + } + printf("done\n"); + } + else { + printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task); + std::mt19937 rng(1); + std::vector aux(n_task); + for (uint32_t i = 0; i < n_task; ++i) aux[i] = i; + float scale = 1.f/(1.f + (float)std::mt19937::max()); + tasks.resize(params.multiple_choice_tasks); + for (auto& task : tasks) { + int j = (int)(scale * rng() * aux.size()); + int idx = aux[j]; + aux[j] = aux.back(); + aux.pop_back(); + strstream.seekg(task_pos[idx], std::ios::beg); + if (!task.deserialize(strstream)) { + printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]); + return; + } + } + n_task = params.multiple_choice_tasks; + } + + // This is needed as usual for LLaMA models + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + + printf("%s: preparing task data", __func__); + fflush(stdout); + if (n_task > 500) { + printf("..."); + fflush(stdout); + std::atomic counter(0); + std::atomic n_bad(0); + auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () { + int num_tasks = tasks.size(); + int n_bad_local = 0; + while (true) { + int first = counter.fetch_add(K_TOKEN_CHUNK); + if (first >= num_tasks) { + if (n_bad_local > 0) n_bad += n_bad_local; + break; + } + int last = std::min(first + K_TOKEN_CHUNK, num_tasks); + for (int i = first; i < last; ++i) { + if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local; + } + } + }; + size_t max_thread = std::thread::hardware_concurrency(); + max_thread = std::min(max_thread, (tasks.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK); + std::vector workers(max_thread-1); + for (auto& w : workers) w = std::thread(prepare); + prepare(); + for (auto& w : workers) w.join(); + printf("done\n"); + fflush(stdout); + int nbad = n_bad; + if (nbad > 0) { + printf("%s: found %d malformed tasks\n", __func__, nbad); + return; + } + } else { + int n_dot = n_task/100; + int i_task = 0; + for (auto& task : tasks) { + ++i_task; + if 
(!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) { + return; + } + if (i_task%n_dot == 0) { + printf("."); + fflush(stdout); + } + } + printf("done\n"); + } + + printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size()); + + printf("\ntask\tacc_norm\n"); + + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_ctx = llama_n_ctx(ctx); + const int n_batch = params.n_batch; + + const int max_tasks_per_batch = 32; + const int max_seq = 4*max_tasks_per_batch; + + llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); + + std::vector tok_logits(n_vocab); + std::vector batch_logits(n_vocab*n_ctx); + + std::vector> eval_pairs; + std::vector eval_results; + std::vector workers(std::thread::hardware_concurrency()); + std::vector batch_indeces; + + int n_done = 0; + int n_correct = 0; + int n_tot_answers = 0; + + for (size_t i0 = 0; i0 < tasks.size(); i0++) { + int n_cur = 0; + + size_t i1 = i0; + size_t i_batch = 0; // this tells us where in `llama_batch` we are currently + + llama_batch_clear(batch); + + // batch as much tasks as possible into the available context + // each task has 4 unique seuqnce ids - one for each ending + // the common prefix is shared among the 4 sequences to save tokens + // we extract logits only from the last common token and from all ending tokens of each sequence + int s0 = 0; + while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) { + auto& cur_task = tasks[i1]; + + int num_answers = cur_task.seq_tokens.size(); + if (s0 + num_answers > max_seq) { + break; + } + + if (int(batch_indeces.size()) != num_answers) { + batch_indeces.resize(num_answers); + } + for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s; + + for (size_t i = 0; i < cur_task.common_prefix; ++i) { + //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false); + llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false); + } + batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix + + for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) { + for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size(); ++i) { + llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true); + } + } + + s0 += num_answers; + + cur_task.i_batch = i_batch; + i_batch += cur_task.required_tokens; + + n_cur += cur_task.required_tokens; + if (++i1 == tasks.size()) { + break; + } + } + + if (i0 == i1) { + fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); + return; + } + + llama_kv_cache_clear(ctx); + + // decode all tasks [i0, i1) + if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { + fprintf(stderr, "%s: llama_decode() failed\n", __func__); + return; + } + + // Compute log-probs in parallel + // First we collect all tasks + eval_pairs.clear(); + for (size_t i = i0; i < i1; ++i) { + auto& cur_task = tasks[i]; + size_t li = cur_task.common_prefix; + for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) { + for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) { + eval_pairs.push_back(std::make_pair(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1])); + } + ++li; + } + } + // Then we do the actual calculation + compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results); + + size_t ir = 0; + + // compute the logprobs for each ending of the decoded tasks + for (size_t i = i0; i < i1; ++i) { + auto & cur_task = tasks[i]; + 
//printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str()); + //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) { + // if (cur_task.mc1.labels[j] == 1) { + // printf("%d", j+1); + // } + //} + //printf("\n common_prefix: %zu\n", cur_task.common_prefix); + + std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float)); + + const auto first_probs = softmax(tok_logits); + + cur_task.log_probs.resize(cur_task.seq_tokens.size()); + for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) { + size_t count = 1; + float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]); + for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) { + //printf(" %zu %g\n", ir, eval_results[ir]); + ++count; + log_prob += eval_results[ir++]; + } + cur_task.log_probs[s] = log_prob / count; + //printf(" Final: %g\n", log_prob / count); + //printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count); + } + + // Find the ending with maximum logprob + size_t logprob_max_idx = 0; + float logprob_max_val = cur_task.log_probs[0]; + for (size_t s = 1; s < cur_task.log_probs.size(); s++) { + if (cur_task.log_probs[s] > logprob_max_val) { + logprob_max_val = cur_task.log_probs[s]; + logprob_max_idx = s; + } + } + + n_tot_answers += cur_task.log_probs.size(); + if (cur_task.mc1.labels[logprob_max_idx] == 1) { + ++n_correct; + } + ++n_done; + + // Print the accumulated accuracy mean x 100 + printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done); + fflush(stdout); + } + + i0 = i1 - 1; + } + + llama_batch_free(batch); + + if (n_done < 100) return; + + float p = 1.f*n_correct/n_done; + float sigma = sqrt(p*(1-p)/(n_done-1)); + printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); + p = 1.f*n_done/n_tot_answers; + sigma = sqrt(p*(1-p)/(n_done-1)); + printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); + + printf("\n"); +} + +static void kl_divergence(llama_context * ctx, const gpt_params & params) { + if (params.logits_file.empty()) { + fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); + return; + } + std::ifstream in(params.logits_file.c_str(), std::ios::binary); + if (!in) { + fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str()); + return; + } + { + char check[9]; check[8] = 0; + in.read(check, 8); + if (in.fail() || strncmp("_logits_", check, 8) != 0) { + fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str()); + return; + } + } + + uint32_t n_ctx; + in.read((char *)&n_ctx, sizeof(n_ctx)); + if (n_ctx > llama_n_ctx(ctx)) { + fprintf(stderr, "%s: %s has been computed with %d, while the current context is %d. 
Increase it with -c and retry\n", + __func__, params.logits_file.c_str(), n_ctx, params.n_ctx); + } + + int n_vocab, n_chunk; + in.read((char *)&n_vocab, sizeof(n_vocab)); + in.read((char *)&n_chunk, sizeof(n_chunk)); + if (in.fail()) { + fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str()); + return; + } + if (n_vocab != llama_n_vocab(llama_get_model(ctx))) { + fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); + } + + std::vector tokens(n_ctx * n_chunk); + if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) { + fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str()); + return; + } + + const int n_batch = params.n_batch; + const int num_batches = (n_ctx + n_batch - 1)/n_batch; + const int nv = 2*((n_vocab + 1)/2) + 4; + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + + std::vector log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv); + std::vector kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); + std::vector logits; + if (num_batches > 1) { + logits.reserve(n_ctx * n_vocab); + } + + std::vector workers(std::thread::hardware_concurrency() - 1); + + auto mean_and_uncertainty = [] (double sum, double sum2, size_t count) { + if (count < 1) { + return std::make_pair(0., 0.); + } + double f = sum/count; + double df = sum2/count - f*f; + df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.; + return std::make_pair(f, df); + }; + + kl_divergence_result kld; + auto kld_ptr = kld_values.data(); + + for (int i = 0; i < n_chunk; ++i) { + const int start = i * n_ctx; + const int end = start + n_ctx; + + const auto t_start = std::chrono::high_resolution_clock::now(); + + if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) { + fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i); + return; + } + + // clear the KV cache + llama_kv_cache_clear(ctx); + + for (int j = 0; j < num_batches; ++j) { + const int batch_start = start + j * n_batch; + const int batch_size = std::min(end - batch_start, n_batch); + + // save original token and restore it after eval + const auto token_org = tokens[batch_start]; + + // add BOS token for the first batch of each chunk + if (add_bos && j == 0) { + tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); + } + + if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return; + } + + // restore the original token in case it was set to BOS + tokens[batch_start] = token_org; + + if (num_batches > 1) { + const auto * batch_logits = llama_get_logits(ctx); + logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + } + } + + const auto t_end = std::chrono::high_resolution_clock::now(); + + if (i == 0) { + const float t_total = std::chrono::duration(t_end - t_start).count(); + fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + int total_seconds = (int)(t_total * n_chunk); + if (total_seconds >= 60*60) { + fprintf(stderr, "%d hours ", total_seconds / (60*60)); + total_seconds = total_seconds % (60*60); + } + fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + + printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL-Divergence Same top\n"); + } + + const int first = n_ctx/2; + const float * all_logits = num_batches > 1 ? 
logits.data() : llama_get_logits(ctx); + process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + workers, log_probs_uint16, kld, kld_ptr); + kld_ptr += n_ctx - 1 - first; + + auto ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); + auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count); + auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); + auto p_top = 1.*kld.n_same_top/kld.count; + auto d_p_top = sqrt(p_top*(1 - p_top)/(kld.count - 1)); + + printf("%4d %10.4lf %10.5lf ± %10.5f %10.5f ± %10.5lf %.5f ± %.5f\n", i+1, exp(ppl.first), + log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second, + p_top, d_p_top); + + fflush(stdout); + + logits.clear(); + } + printf("\n"); + + if (kld.count < 100) return; // we do not wish to do statistics on so few values + + std::sort(kld_values.begin(), kld_values.end()); + + printf("===== KL-divergence statistics\n"); + auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); + printf("Average: %10.6f ±%10.6lf\n", kl_div.first, kl_div.second); + auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1]) + : kld_values[kld_values.size()/2]; + printf("Median : %10.6f\n", kld_median); + + auto percentile = [&kld_values] (float fraction) { + if (fraction <= 0) return kld_values.front(); + if (fraction >= 1) return kld_values.back(); + float p = fraction*(kld_values.size() - 1); + size_t ip = size_t(p); p -= ip; + return (1 - p)*kld_values[ip] + p*kld_values[std::min(ip+1, kld_values.size()-1)]; + }; + + printf("Maximum: %10.6f\n", kld_values.back()); + printf("KLD_99 : %10.6f\n", percentile(0.99f)); + printf("KLD_95 : %10.6f\n", percentile(0.95f)); + printf("KLD_90 : %10.6f\n", percentile(0.90f)); + + printf("Minimum: %10.6f\n", kld_values.front()); + printf("KLD_01 : %10.6f\n", percentile(0.01f)); + printf("KLD_05 : %10.6f\n", percentile(0.05f)); + printf("KLD_10 : %10.6f\n", percentile(0.10f)); + +} int main(int argc, char ** argv) { gpt_params params; @@ -1091,6 +1838,10 @@ int main(int argc, char ** argv) { hellaswag_score(ctx, params); } else if (params.winogrande) { winogrande_score(ctx, params); + } else if (params.multiple_choice) { + multiple_choice_score(ctx, params); + } else if (params.kl_divergence) { + kl_divergence(ctx, params); } else { results = perplexity(ctx, params); } diff --git a/examples/pydantic-models-to-grammar-examples.py b/examples/pydantic-models-to-grammar-examples.py index cbf376652..160966649 100644 --- a/examples/pydantic-models-to-grammar-examples.py +++ b/examples/pydantic-models-to-grammar-examples.py @@ -1,14 +1,14 @@ # Function calling example using pydantic models. import datetime +import importlib import json from enum import Enum -from typing import Union, Optional +from typing import Optional, Union import requests from pydantic import BaseModel, Field - -import importlib -from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function +from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert_dictionary_to_pydantic_model, + create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation) # Function to get completion on the llama.cpp server with grammar. 
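Illustrative sketch (not part of this patch): the comment above refers to a helper that asks the llama.cpp server for a completion constrained by a GBNF grammar, but its body lies outside this hunk. Assuming the server example's /completion endpoint with "prompt", "grammar" and "n_predict" JSON fields, such a helper might look roughly like the following; the name create_completion and the default host are placeholders.

import requests

def create_completion(prompt: str, grammar: str, host: str = "http://127.0.0.1:8080") -> str:
    # Ask the server for a completion whose sampling is constrained by the GBNF grammar.
    payload = {
        "prompt": prompt,
        "grammar": grammar,
        "n_predict": 512,
        "temperature": 0.3,
    }
    resp = requests.post(f"{host}/completion", json=payload, timeout=600)
    resp.raise_for_status()
    return resp.json()["content"]
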
@@ -35,7 +35,7 @@ class SendMessageToUser(BaseModel): print(self.message) -# Enum for the calculator function. +# Enum for the calculator tool. class MathOperation(Enum): ADD = "add" SUBTRACT = "subtract" @@ -43,7 +43,7 @@ class MathOperation(Enum): DIVIDE = "divide" -# Very simple calculator tool for the agent. +# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt. class Calculator(BaseModel): """ Perform a math operation on two numbers. @@ -148,37 +148,6 @@ def get_current_datetime(output_format: Optional[str] = None): return datetime.datetime.now().strftime(output_format) -# Enum for the calculator tool. -class MathOperation(Enum): - ADD = "add" - SUBTRACT = "subtract" - MULTIPLY = "multiply" - DIVIDE = "divide" - - - -# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt. -class Calculator(BaseModel): - """ - Perform a math operation on two numbers. - """ - number_one: Union[int, float] = Field(..., description="First number.") - operation: MathOperation = Field(..., description="Math operation to perform.") - number_two: Union[int, float] = Field(..., description="Second number.") - - def run(self): - if self.operation == MathOperation.ADD: - return self.number_one + self.number_two - elif self.operation == MathOperation.SUBTRACT: - return self.number_one - self.number_two - elif self.operation == MathOperation.MULTIPLY: - return self.number_one * self.number_two - elif self.operation == MathOperation.DIVIDE: - return self.number_one / self.number_two - else: - raise ValueError("Unknown operation.") - - # Example function to get the weather def get_current_weather(location, unit): """Get the current weather in a given location""" diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py index 848c1c367..9acc7cc6d 100644 --- a/examples/pydantic_models_to_grammar.py +++ b/examples/pydantic_models_to_grammar.py @@ -1,15 +1,21 @@ +from __future__ import annotations + import inspect import json +import re from copy import copy -from inspect import isclass, getdoc -from types import NoneType +from enum import Enum +from inspect import getdoc, isclass +from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints from docstring_parser import parse -from pydantic import BaseModel, create_model, Field -from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias -from enum import Enum -from typing import get_type_hints, Callable -import re +from pydantic import BaseModel, Field, create_model + +if TYPE_CHECKING: + from types import GenericAlias +else: + # python 3.8 compat + from typing import _GenericAlias as GenericAlias class PydanticDataType(Enum): @@ -43,7 +49,7 @@ class PydanticDataType(Enum): SET = "set" -def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str: +def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str: if isclass(pydantic_type) and issubclass(pydantic_type, str): return PydanticDataType.STRING.value elif isclass(pydantic_type) and issubclass(pydantic_type, bool): @@ -57,22 +63,22 @@ def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str: elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel): return format_model_and_field_name(pydantic_type.__name__) - elif get_origin(pydantic_type) == list: + elif 
get_origin(pydantic_type) is list: element_type = get_args(pydantic_type)[0] return f"{map_pydantic_type_to_gbnf(element_type)}-list" - elif get_origin(pydantic_type) == set: + elif get_origin(pydantic_type) is set: element_type = get_args(pydantic_type)[0] return f"{map_pydantic_type_to_gbnf(element_type)}-set" - elif get_origin(pydantic_type) == Union: + elif get_origin(pydantic_type) is Union: union_types = get_args(pydantic_type) union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types] return f"union-{'-or-'.join(union_rules)}" - elif get_origin(pydantic_type) == Optional: + elif get_origin(pydantic_type) is Optional: element_type = get_args(pydantic_type)[0] return f"optional-{map_pydantic_type_to_gbnf(element_type)}" elif isclass(pydantic_type): return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}" - elif get_origin(pydantic_type) == dict: + elif get_origin(pydantic_type) is dict: key_type, value_type = get_args(pydantic_type) return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}" else: @@ -106,7 +112,6 @@ def get_members_structure(cls, rule_name): return f"{cls.__name__.lower()} ::= " + " | ".join(members) if cls.__annotations__ and cls.__annotations__ != {}: result = f'{rule_name} ::= "{{"' - type_list_rules = [] # Modify this comprehension members = [ f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}' @@ -116,27 +121,25 @@ def get_members_structure(cls, rule_name): result += '"," '.join(members) result += ' "}"' - return result, type_list_rules - elif rule_name == "custom-class-any": + return result + if rule_name == "custom-class-any": result = f"{rule_name} ::= " result += "value" - type_list_rules = [] - return result, type_list_rules - else: - init_signature = inspect.signature(cls.__init__) - parameters = init_signature.parameters - result = f'{rule_name} ::= "{{"' - type_list_rules = [] - # Modify this comprehension too - members = [ - f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}' - for name, param in parameters.items() - if name != "self" and param.annotation != inspect.Parameter.empty - ] + return result - result += '", "'.join(members) - result += ' "}"' - return result, type_list_rules + init_signature = inspect.signature(cls.__init__) + parameters = init_signature.parameters + result = f'{rule_name} ::= "{{"' + # Modify this comprehension too + members = [ + f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}' + for name, param in parameters.items() + if name != "self" and param.annotation != inspect.Parameter.empty + ] + + result += '", "'.join(members) + result += ' "}"' + return result def regex_to_gbnf(regex_pattern: str) -> str: @@ -269,7 +272,7 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None def generate_gbnf_rule_for_type( model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None -) -> Tuple[str, list]: +) -> tuple[str, list[str]]: """ Generate GBNF rule for a given field type. @@ -283,7 +286,7 @@ def generate_gbnf_rule_for_type( :param field_info: Additional information about the field (optional). :return: Tuple containing the GBNF type and a list of additional rules. 
- :rtype: Tuple[str, list] + :rtype: tuple[str, list] """ rules = [] @@ -321,8 +324,7 @@ def generate_gbnf_rule_for_type( gbnf_type, rules = model_name + "-" + field_name, rules elif gbnf_type.startswith("custom-class-"): - nested_model_rules, field_types = get_members_structure(field_type, gbnf_type) - rules.append(nested_model_rules) + rules.append(get_members_structure(field_type, gbnf_type)) elif gbnf_type.startswith("custom-dict-"): key_type, value_type = get_args(field_type) @@ -341,14 +343,14 @@ def generate_gbnf_rule_for_type( union_rules = [] for union_type in union_types: - if isinstance(union_type, _GenericAlias): + if isinstance(union_type, GenericAlias): union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type( model_name, field_name, union_type, False, processed_models, created_rules ) union_rules.append(union_gbnf_type) rules.extend(union_rules_list) - elif not issubclass(union_type, NoneType): + elif not issubclass(union_type, type(None)): union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type( model_name, field_name, union_type, False, processed_models, created_rules ) @@ -424,14 +426,10 @@ def generate_gbnf_rule_for_type( else: gbnf_type, rules = gbnf_type, [] - if gbnf_type not in created_rules: - return gbnf_type, rules - else: - if gbnf_type in created_rules: - return gbnf_type, rules + return gbnf_type, rules -def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created_rules: dict) -> (list, bool, bool): +def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[BaseModel]], created_rules: dict[str, list[str]]) -> tuple[list[str], bool]: """ Generate GBnF Grammar @@ -452,7 +450,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created ``` """ if model in processed_models: - return [] + return [], False processed_models.add(model) model_name = format_model_and_field_name(model.__name__) @@ -518,7 +516,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created def generate_gbnf_grammar_from_pydantic_models( - models: List[Type[BaseModel]], outer_object_name: str = None, outer_object_content: str = None, + models: list[type[BaseModel]], outer_object_name: str | None = None, outer_object_content: str | None = None, list_of_outputs: bool = False ) -> str: """ @@ -528,7 +526,7 @@ def generate_gbnf_grammar_from_pydantic_models( * grammar. Args: - models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from. + models (list[type[BaseModel]]): A list of Pydantic models to generate the grammar from. outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. list_of_outputs (str, optional): Allows a list of output objects @@ -543,9 +541,9 @@ def generate_gbnf_grammar_from_pydantic_models( # root ::= UserModel | PostModel # ... """ - processed_models = set() + processed_models: set[type[BaseModel]] = set() all_rules = [] - created_rules = {} + created_rules: dict[str, list[str]] = {} if outer_object_name is None: for model in models: model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules) @@ -608,7 +606,7 @@ def get_primitive_grammar(grammar): Returns: str: GBNF primitive grammar string. 
""" - type_list = [] + type_list: list[type[object]] = [] if "string-list" in grammar: type_list.append(str) if "boolean-list" in grammar: @@ -666,14 +664,14 @@ triple-quotes ::= "'''" """ def generate_markdown_documentation( - pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields", + pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields", documentation_with_field_description=True ) -> str: """ Generate markdown documentation for a list of Pydantic models. Args: - pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes. + pydantic_models (list[type[BaseModel]]): list of Pydantic model classes. model_prefix (str): Prefix for the model section. fields_prefix (str): Prefix for the fields section. documentation_with_field_description (bool): Include field descriptions in the documentation. @@ -731,7 +729,7 @@ def generate_markdown_documentation( def generate_field_markdown( - field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, + field_name: str, field_type: type[Any], model: type[BaseModel], depth=1, documentation_with_field_description=True ) -> str: """ @@ -739,8 +737,8 @@ def generate_field_markdown( Args: field_name (str): Name of the field. - field_type (Type[Any]): Type of the field. - model (Type[BaseModel]): Pydantic model class. + field_type (type[Any]): Type of the field. + model (type[BaseModel]): Pydantic model class. depth (int): Indentation depth in the documentation. documentation_with_field_description (bool): Include field descriptions in the documentation. @@ -798,7 +796,7 @@ def generate_field_markdown( return field_text -def format_json_example(example: dict, depth: int) -> str: +def format_json_example(example: dict[str, Any], depth: int) -> str: """ Format a JSON example into a readable string with indentation. @@ -819,14 +817,14 @@ def format_json_example(example: dict, depth: int) -> str: def generate_text_documentation( - pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields", + pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields", documentation_with_field_description=True ) -> str: """ Generate text documentation for a list of Pydantic models. Args: - pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes. + pydantic_models (list[type[BaseModel]]): List of Pydantic model classes. model_prefix (str): Prefix for the model section. fields_prefix (str): Prefix for the fields section. documentation_with_field_description (bool): Include field descriptions in the documentation. @@ -885,7 +883,7 @@ def generate_text_documentation( def generate_field_text( - field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, + field_name: str, field_type: type[Any], model: type[BaseModel], depth=1, documentation_with_field_description=True ) -> str: """ @@ -893,8 +891,8 @@ def generate_field_text( Args: field_name (str): Name of the field. - field_type (Type[Any]): Type of the field. - model (Type[BaseModel]): Pydantic model class. + field_type (type[Any]): Type of the field. + model (type[BaseModel]): Pydantic model class. depth (int): Indentation depth in the documentation. documentation_with_field_description (bool): Include field descriptions in the documentation. 
@@ -1017,8 +1015,8 @@ def generate_and_save_gbnf_grammar_and_documentation( pydantic_model_list, grammar_file_path="./generated_grammar.gbnf", documentation_file_path="./generated_grammar_documentation.md", - outer_object_name: str = None, - outer_object_content: str = None, + outer_object_name: str | None = None, + outer_object_content: str | None = None, model_prefix: str = "Output Model", fields_prefix: str = "Output Fields", list_of_outputs: bool = False, @@ -1053,8 +1051,8 @@ def generate_and_save_gbnf_grammar_and_documentation( def generate_gbnf_grammar_and_documentation( pydantic_model_list, - outer_object_name: str = None, - outer_object_content: str = None, + outer_object_name: str | None = None, + outer_object_content: str | None = None, model_prefix: str = "Output Model", fields_prefix: str = "Output Fields", list_of_outputs: bool = False, @@ -1086,9 +1084,9 @@ def generate_gbnf_grammar_and_documentation( def generate_gbnf_grammar_and_documentation_from_dictionaries( - dictionaries: List[dict], - outer_object_name: str = None, - outer_object_content: str = None, + dictionaries: list[dict[str, Any]], + outer_object_name: str | None = None, + outer_object_content: str | None = None, model_prefix: str = "Output Model", fields_prefix: str = "Output Fields", list_of_outputs: bool = False, @@ -1098,7 +1096,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries( Generate GBNF grammar and documentation from a list of dictionaries. Args: - dictionaries (List[dict]): List of dictionaries representing Pydantic models. + dictionaries (list[dict]): List of dictionaries representing Pydantic models. outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. model_prefix (str): Prefix for the model section in the documentation. @@ -1120,7 +1118,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries( return grammar, documentation -def create_dynamic_model_from_function(func: Callable): +def create_dynamic_model_from_function(func: Callable[..., Any]): """ Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method. @@ -1135,6 +1133,7 @@ def create_dynamic_model_from_function(func: Callable): sig = inspect.signature(func) # Parse the docstring + assert func.__doc__ is not None docstring = parse(func.__doc__) dynamic_fields = {} @@ -1157,7 +1156,6 @@ def create_dynamic_model_from_function(func: Callable): f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring") # Add parameter details to the schema - param_doc = next((d for d in docstring.params if d.arg_name == param.name), None) param_docs.append((param.name, param_doc)) if param.default == inspect.Parameter.empty: default_value = ... 
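`create_dynamic_model_from_function` above turns a function's signature and docstring into a Pydantic model. The sketch below reproduces only the signature half of that idea with `inspect` and `create_model`, to make the `(type, default)` field tuples concrete; docstring parsing, field descriptions, and the generated `run` method are deliberately omitted, and `make_model_from_function` / `lookup_weather` are hypothetical names, not part of the patch:

```python
import inspect
from pydantic import BaseModel, create_model


def make_model_from_function(func) -> type[BaseModel]:
    # Collect (type, default) tuples from the function signature, the same
    # shape of mapping that create_dynamic_model_from_function builds above.
    fields = {}
    for name, param in inspect.signature(func).parameters.items():
        if name == "self":
            continue
        annotation = param.annotation if param.annotation is not inspect.Parameter.empty else str
        default = ... if param.default is inspect.Parameter.empty else param.default
        fields[name] = (annotation, default)  # Ellipsis marks a required field
    return create_model(func.__name__, **fields)


def lookup_weather(city: str, unit: str = "celsius") -> str:
    """Return a short weather summary for a city."""
    return f"sunny in {city} ({unit})"


WeatherArgs = make_model_from_function(lookup_weather)
print(WeatherArgs(city="Oslo"))  # -> city='Oslo' unit='celsius'
```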
@@ -1166,10 +1164,10 @@ def create_dynamic_model_from_function(func: Callable): dynamic_fields[param.name] = ( param.annotation if param.annotation != inspect.Parameter.empty else str, default_value) # Creating the dynamic model - dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) + dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) # type: ignore[call-overload] - for param_doc in param_docs: - dynamic_model.model_fields[param_doc[0]].description = param_doc[1].description + for name, param_doc in param_docs: + dynamic_model.model_fields[name].description = param_doc.description dynamic_model.__doc__ = docstring.short_description @@ -1182,16 +1180,16 @@ def create_dynamic_model_from_function(func: Callable): return dynamic_model -def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable): +def add_run_method_to_dynamic_model(model: type[BaseModel], func: Callable[..., Any]): """ Add a 'run' method to a dynamic Pydantic model, using the provided function. Args: - model (Type[BaseModel]): Dynamic Pydantic model class. + model (type[BaseModel]): Dynamic Pydantic model class. func (Callable): Function to be added as a 'run' method to the model. Returns: - Type[BaseModel]: Pydantic model class with the added 'run' method. + type[BaseModel]: Pydantic model class with the added 'run' method. """ def run_method_wrapper(self): @@ -1204,15 +1202,15 @@ def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable): return model -def create_dynamic_models_from_dictionaries(dictionaries: List[dict]): +def create_dynamic_models_from_dictionaries(dictionaries: list[dict[str, Any]]): """ Create a list of dynamic Pydantic model classes from a list of dictionaries. Args: - dictionaries (List[dict]): List of dictionaries representing model structures. + dictionaries (list[dict]): List of dictionaries representing model structures. Returns: - List[Type[BaseModel]]: List of generated dynamic Pydantic model classes. + list[type[BaseModel]]: List of generated dynamic Pydantic model classes. """ dynamic_models = [] for func in dictionaries: @@ -1249,7 +1247,7 @@ def list_to_enum(enum_name, values): return Enum(enum_name, {value: value for value in values}) -def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "CustomModel") -> Type[BaseModel]: +def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: str = "CustomModel") -> type[Any]: """ Convert a dictionary to a Pydantic model class. @@ -1258,9 +1256,9 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu model_name (str): Name of the generated Pydantic model. Returns: - Type[BaseModel]: Generated Pydantic model class. + type[BaseModel]: Generated Pydantic model class. """ - fields = {} + fields: dict[str, Any] = {} if "properties" in dictionary: for field_name, field_data in dictionary.get("properties", {}).items(): @@ -1277,7 +1275,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu if items != {}: array = {"properties": items} array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items") - fields[field_name] = (List[array_type], ...) + fields[field_name] = (List[array_type], ...) # type: ignore[valid-type] else: fields[field_name] = (list, ...) 
elif field_type == "object": diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 2ae046933..f4786157e 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -26,6 +26,7 @@ static const std::vector QUANT_OPTIONS = { { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, + { "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", }, { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", }, diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 81709e448..cc13b2d63 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,7 +1,7 @@ set(TARGET server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -add_executable(${TARGET} server.cpp json.hpp httplib.h) +add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h) install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ diff --git a/examples/server/README.md b/examples/server/README.md index fd3034b99..dce4ec47c 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -30,7 +30,8 @@ Command line options: - `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled) - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. - +- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w` +- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n` ## Build server is build alongside everything else from the root of the project @@ -65,6 +66,14 @@ server.exe -m models\7B\ggml-model.gguf -c 2048 The above command will start a server that by default listens on `127.0.0.1:8080`. You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url. +### Docker: +```bash +docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 + +# or, with CUDA: +docker run -p 8080:8080 -v /path/to/models:/models --gpus all ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99 +``` + ## Testing with CURL Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS. 
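The new `--grp-attn-n` / `--grp-attn-w` flags documented above enable self-extend in the server: the most recent `ga_w` tokens keep full-resolution positions while each earlier `ga_w`-token window is compressed by a factor of `ga_n` in the KV cache. The following is a rough Python rendering of the position bookkeeping the server changes below perform when restoring a cached prompt; it is illustrative only (the function name is invented here), and the real code additionally issues `llama_kv_cache_seq_shift` / `llama_kv_cache_seq_div` calls to move the cache itself:

```python
def self_extend_positions(n_past: int, ga_n: int, ga_w: int) -> tuple[int, int]:
    """Map a logical token count to (compressed KV position, group boundary)."""
    ga_i = 0        # start of the current full-resolution window, in compressed coords
    n_past_se = 0   # compressed position after grouping all completed windows
    for _ in range(n_past):
        while n_past_se >= ga_i + ga_w:
            bd = (ga_w // ga_n) * (ga_n - 1)  # how much a completed window shrinks
            n_past_se -= bd
            ga_i += ga_w // ga_n
        n_past_se += 1
    return n_past_se, ga_i


# With ga_n=4 and ga_w=512, a 2048-token prompt occupies 896 KV positions:
# three completed 512-token windows compressed 4x to 128 each, plus the
# current 512-token window at full resolution (3*128 + 512 = 896, ga_i = 384).
print(self_extend_positions(2048, 4, 512))  # -> (896, 384)
```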
diff --git a/examples/server/oai.hpp b/examples/server/oai.hpp new file mode 100644 index 000000000..bc5db6eef --- /dev/null +++ b/examples/server/oai.hpp @@ -0,0 +1,208 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "json.hpp" +#include "utils.hpp" + +#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" + +using json = nlohmann::json; + +inline static json oaicompat_completion_params_parse( + const json &body /* openai api json semantics */) +{ + json llama_params; + + llama_params["__oaicompat"] = true; + + // Map OpenAI parameters to llama.cpp parameters + // + // For parameters that are defined by the OpenAI documentation (e.g. + // temperature), we explicitly specify OpenAI's intended default; we + // need to do that because sometimes OpenAI disagrees with llama.cpp + // + // https://platform.openai.com/docs/api-reference/chat/create + llama_sampling_params default_sparams; + llama_params["model"] = json_value(body, "model", std::string("unknown")); + llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' + llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); + llama_params["temperature"] = json_value(body, "temperature", 0.0); + llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k); + llama_params["top_p"] = json_value(body, "top_p", 1.0); + llama_params["n_predict"] = json_value(body, "max_tokens", -1); + llama_params["logit_bias"] = json_value(body, "logit_bias",json::object()); + llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0); + llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); + llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED); + llama_params["stream"] = json_value(body, "stream", false); + llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat); + llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau); + llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta); + llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl); + llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p); + llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n); + llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); + llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z); + + if (body.count("grammar") != 0) { + llama_params["grammar"] = json_value(body, "grammar", json::object()); + } + + // Handle 'stop' field + if (body.contains("stop") && body["stop"].is_string()) { + llama_params["stop"] = json::array({body["stop"].get()}); + } else { + llama_params["stop"] = json_value(body, "stop", json::array()); + } + + // Ensure there is ChatML-specific end sequence among stop words + llama_params["stop"].push_back("<|im_end|>"); + + return llama_params; +} + +inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false) +{ + json result = response.result_json; + + bool stopped_word = result.count("stopped_word") != 0; + bool stopped_eos = json_value(result, "stopped_eos", false); + int num_tokens_predicted = json_value(result, "tokens_predicted", 0); + int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); + std::string content = json_value(result, "content", std::string("")); + + std::string 
finish_reason = "length"; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + + json choices = + streaming ? json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}) + : json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"message", json{{"content", content}, + {"role", "assistant"}}}}}); + + std::time_t t = std::time(0); + + json res = + json{{"choices", choices}, + {"created", t}, + {"model", + json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, + {"object", streaming ? "chat.completion.chunk" : "chat.completion"}, + {"usage", + json{{"completion_tokens", num_tokens_predicted}, + {"prompt_tokens", num_prompt_tokens}, + {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, + {"id", gen_chatcmplid()}}; + + if (server_verbose) { + res["__verbose"] = result; + } + + if (result.contains("completion_probabilities")) { + res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); + } + + return res; +} + +// return value is vector as there is one case where we might need to generate two responses +inline static std::vector format_partial_response_oaicompat(const task_result &response) { + json result = response.result_json; + + if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { + return std::vector({response.result_json}); + } + + bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; + std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + + bool stopped_word = json_value(result, "stopped_word", false); + bool stopped_eos = json_value(result, "stopped_eos", false); + bool stopped_limit = json_value(result, "stopped_limit", false); + std::string content = json_value(result, "content", std::string("")); + + std::string finish_reason; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + if (stopped_limit) { + finish_reason = "length"; + } + + std::time_t t = std::time(0); + + json choices; + + if (!finish_reason.empty()) { + choices = json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}); + } else { + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}}); + } else { + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{ + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}} + }})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + return std::vector({initial_ret, second_ret}); + } + } else { + // Some idiosyncrasy in task processing logic makes several trailing calls + // with empty content, we ignore these at the calee site. 
+ if (content.empty()) { + return std::vector({json::object()}); + } + + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); + } + } + + json ret = json{{"choices", choices}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + return std::vector({ret}); +} diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 0462fbd24..f58a2acaa 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,6 +1,8 @@ #include "common.h" #include "llama.h" #include "grammar-parser.h" +#include "utils.hpp" +#include "oai.hpp" #include "../llava/clip.h" @@ -23,17 +25,10 @@ #include #include -#include #include #include #include -#ifndef SERVER_VERBOSE -#define SERVER_VERBOSE 1 -#endif - -#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" - using json = nlohmann::json; struct server_params @@ -46,197 +41,7 @@ struct server_params int32_t write_timeout = 600; }; -static bool server_verbose = false; - -#if SERVER_VERBOSE != 1 -#define LOG_VERBOSE(MSG, ...) -#else -#define LOG_VERBOSE(MSG, ...) \ - do \ - { \ - if (server_verbose) \ - { \ - server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ - } \ - } while (0) -#endif - -#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) - -json oaicompat_completion_params_parse(const json &body); -std::string format_chatml(std::vector messages); - - -// -// base64 utils (TODO: move to common in the future) -// - -static const std::string base64_chars = - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; - -static inline bool is_base64(uint8_t c) -{ - return (isalnum(c) || (c == '+') || (c == '/')); -} - -static std::vector base64_decode(const std::string & encoded_string) -{ - int i = 0; - int j = 0; - int in_ = 0; - - int in_len = encoded_string.size(); - - uint8_t char_array_4[4]; - uint8_t char_array_3[3]; - - std::vector ret; - - while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) - { - char_array_4[i++] = encoded_string[in_]; in_++; - if (i == 4) - { - for (i = 0; i <4; i++) - { - char_array_4[i] = base64_chars.find(char_array_4[i]); - } - - char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - - for (i = 0; (i < 3); i++) - { - ret.push_back(char_array_3[i]); - } - i = 0; - } - } - - if (i) - { - for (j = i; j <4; j++) - { - char_array_4[j] = 0; - } - - for (j = 0; j <4; j++) - { - char_array_4[j] = base64_chars.find(char_array_4[j]); - } - - char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - - for (j = 0; (j < i - 1); j++) - { - ret.push_back(char_array_3[j]); - } - } - - return ret; -} - -// -// parallel -// - -enum server_state { - SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet - SERVER_STATE_READY, // Server is ready and model is loaded - SERVER_STATE_ERROR // An error occurred, load_model failed -}; - -enum task_type 
{ - TASK_TYPE_COMPLETION, - TASK_TYPE_CANCEL, -}; - -struct task_server { - int id; - int target_id; - task_type type; - json data; - bool infill_mode = false; - bool embedding_mode = false; - int multitask_id = -1; -}; - -struct task_result { - int id; - int multitask_id = -1; - bool stop; - bool error; - json result_json; -}; - -struct task_multi { - int id; - std::set subtasks_remaining{}; - std::vector results{}; -}; - -// TODO: can become bool if we can't find use of more states -enum slot_state -{ - IDLE, - PROCESSING, -}; - -enum slot_command -{ - NONE, - LOAD_PROMPT, - RELEASE, -}; - -struct slot_params -{ - bool stream = true; - bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt - - uint32_t seed = -1; // RNG seed - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_predict = -1; // new tokens to predict - - std::vector antiprompt; - - json input_prefix; - json input_suffix; -}; - -struct slot_image -{ - int32_t id; - - bool request_encode_image = false; - float * image_embedding = nullptr; - int32_t image_tokens = 0; - - clip_image_u8 * img_data; - - std::string prefix_prompt; // before of this image -}; - -// completion token output with probabilities -struct completion_token_output -{ - struct token_prob - { - llama_token tok; - float prob; - }; - - std::vector probs; - llama_token tok; - std::string text_to_send; -}; +bool server_verbose = false; static size_t common_part(const std::vector &a, const std::vector &b) { @@ -292,28 +97,6 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) return ret; } -static void server_log(const char *level, const char *function, int line, - const char *message, const nlohmann::ordered_json &extra) -{ - nlohmann::ordered_json log - { - {"timestamp", time(nullptr)}, - {"level", level}, - {"function", function}, - {"line", line}, - {"message", message}, - }; - - if (!extra.empty()) - { - log.merge_patch(extra); - } - - const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); - printf("%.*s\n", (int)str.size(), str.data()); - fflush(stdout); -} - // format incomplete utf-8 multibyte character for output static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) { @@ -355,15 +138,6 @@ static json probs_vector_to_json(const llama_context *ctx, const std::vector -static T json_value(const json &body, const std::string &key, const T &default_value) -{ - // Fallback null to default value - return body.contains(key) && !body.at(key).is_null() - ? 
body.value(key, default_value) - : default_value; -} - struct llama_client_slot { int id; @@ -410,6 +184,12 @@ struct llama_client_slot struct llama_sampling_params sparams; llama_sampling_context *ctx_sampling = nullptr; + int32_t ga_i = 0; // group-attention state + int32_t ga_n = 1;// group-attention factor + int32_t ga_w = 512; // group-attention width + + int32_t n_past_se = 0; // self-extend + // multimodal std::vector images; @@ -438,7 +218,8 @@ struct llama_client_slot sent_count = 0; sent_token_probs_index = 0; infill = false; - + ga_i = 0; + n_past_se = 0; generated_token_probs.clear(); for (slot_image & img : images) @@ -491,7 +272,7 @@ struct llama_client_slot } void release() { - if (state == IDLE || state == PROCESSING) + if (state == PROCESSING) { t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; command = RELEASE; @@ -539,7 +320,6 @@ struct llama_server_context bool all_slots_are_idle = false; bool add_bos_token = true; - int32_t id_gen; int32_t n_ctx; // total context for all clients / slots // system prompt @@ -554,13 +334,8 @@ struct llama_server_context // slots / clients std::vector slots; - std::vector queue_tasks; - std::vector queue_results; - std::vector queue_multitasks; - std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks - std::condition_variable condition_tasks; - std::mutex mutex_results; - std::condition_variable condition_results; + llama_server_queue queue_tasks; + llama_server_response queue_results; ~llama_server_context() { @@ -619,8 +394,6 @@ struct llama_server_context } void initialize() { - id_gen = 0; - // create slots all_slots_are_idle = true; @@ -633,9 +406,26 @@ struct llama_server_context slot.id = i; slot.n_ctx = n_ctx_slot; - slot.reset(); LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); + + const int ga_n = params.grp_attn_n; + const int ga_w = params.grp_attn_w; + + if (ga_n != 1) { + GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT + GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT + //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT + //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT + LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w); + } + + slot.ga_i = 0; + slot.ga_n = ga_n; + slot.ga_w = ga_w; + + slot.reset(); + slots.push_back(slot); } @@ -891,7 +681,7 @@ struct llama_server_context while ((pos = prompt.find(pattern, pos)) != std::string::npos) { size_t end_prefix = pos; pos += pattern.length(); - size_t end_pos = prompt.find("]", pos); + size_t end_pos = prompt.find(']', pos); if (end_pos != std::string::npos) { std::string image_id = prompt.substr(pos, end_pos - pos); @@ -1183,39 +973,13 @@ struct llama_server_context void send_error(task_server& task, const std::string &error) { LOG_TEE("task %i - error: %s\n", task.id, error.c_str()); - std::unique_lock lock(mutex_results); task_result res; res.id = task.id; res.multitask_id = task.multitask_id; res.stop = false; res.error = true; res.result_json = { { "content", error } }; - queue_results.push_back(res); - condition_results.notify_all(); - } - - void add_multi_task(int id, std::vector& sub_ids) - { - std::lock_guard lock(mutex_tasks); - task_multi multi; - multi.id = id; - std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end())); - queue_multitasks.push_back(multi); - condition_tasks.notify_one(); - } - - 
void update_multi_task(int multitask_id, int subtask_id, task_result& result) - { - std::lock_guard lock(mutex_tasks); - for (auto& multitask : queue_multitasks) - { - if (multitask.id == multitask_id) - { - multitask.subtasks_remaining.erase(subtask_id); - multitask.results.push_back(result); - condition_tasks.notify_one(); - } - } + queue_results.send(res); } json get_model_props() @@ -1261,7 +1025,6 @@ struct llama_server_context void send_partial_response(llama_client_slot &slot, completion_token_output tkn) { - std::unique_lock lock(mutex_results); task_result res; res.id = slot.task_id; res.multitask_id = slot.multitask_id; @@ -1296,13 +1059,11 @@ struct llama_server_context res.result_json["model"] = slot.oaicompat_model; } - queue_results.push_back(res); - condition_results.notify_all(); + queue_results.send(res); } void send_final_response(llama_client_slot &slot) { - std::unique_lock lock(mutex_results); task_result res; res.id = slot.task_id; res.multitask_id = slot.multitask_id; @@ -1351,22 +1112,11 @@ struct llama_server_context res.result_json["model"] = slot.oaicompat_model; } - queue_results.push_back(res); - condition_results.notify_all(); - - // done with results, unlock - lock.unlock(); - - // parent multitask, if any, needs to be updated - if (slot.multitask_id != -1) - { - update_multi_task(slot.multitask_id, slot.task_id, res); - } + queue_results.send(res); } void send_embedding(llama_client_slot &slot) { - std::unique_lock lock(mutex_results); task_result res; res.id = slot.task_id; res.multitask_id = slot.multitask_id; @@ -1393,15 +1143,13 @@ struct llama_server_context {"embedding", embedding }, }; } - queue_results.push_back(res); - condition_results.notify_all(); + queue_results.send(res); } - int request_completion(json data, bool infill, bool embedding, int multitask_id) + void request_completion(int task_id, json data, bool infill, bool embedding, int multitask_id) { - std::unique_lock lock(mutex_tasks); task_server task; - task.id = id_gen++; + task.id = task_id; task.target_id = 0; task.data = std::move(data); task.infill_mode = infill; @@ -1412,47 +1160,11 @@ struct llama_server_context // when a completion task's prompt array is not a singleton, we split it into multiple requests if (task.data.count("prompt") && task.data.at("prompt").size() > 1) { - lock.unlock(); // entering new func scope - return split_multiprompt_task(task); + split_multiprompt_task(task_id, task); } // otherwise, it's a single-prompt task, we actually queue it - queue_tasks.push_back(task); - condition_tasks.notify_one(); - return task.id; - } - - task_result next_result(int task_id) - { - while (true) - { - std::unique_lock lock(mutex_results); - condition_results.wait(lock, [&]{ - return !queue_results.empty(); - }); - - for (int i = 0; i < (int) queue_results.size(); i++) - { - // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result - if (queue_results[i].multitask_id == task_id) - { - update_multi_task(task_id, queue_results[i].id, queue_results[i]); - queue_results.erase(queue_results.begin() + i); - continue; - } - - if (queue_results[i].id == task_id) - { - assert(queue_results[i].multitask_id == -1); - task_result res = queue_results[i]; - queue_results.erase(queue_results.begin() + i); - return res; - } - } - } - - // never reached - //return task_result{-1, false, false, {}}; + queue_tasks.post(task); } // for multiple images processing @@ -1525,150 +1237,117 @@ struct llama_server_context void request_cancel(int task_id) 
{ - std::unique_lock lock(mutex_tasks); task_server task; - task.id = id_gen++; task.type = TASK_TYPE_CANCEL; task.target_id = task_id; - queue_tasks.push_back(task); - condition_tasks.notify_one(); + queue_tasks.post(task); } - int split_multiprompt_task(task_server& multiprompt_task) + void split_multiprompt_task(int multitask_id, task_server& multiprompt_task) { int prompt_count = multiprompt_task.data.at("prompt").size(); assert(prompt_count > 1); - int multitask_id = id_gen++; + // generate all the ID for subtask std::vector subtask_ids(prompt_count); for (int i = 0; i < prompt_count; i++) + { + subtask_ids[i] = queue_tasks.get_new_id(); + } + + // queue up the multitask so we can track its subtask progression + queue_tasks.add_multitask(multitask_id, subtask_ids); + + // add subtasks + for (int i = 0; i < prompt_count; i++) { json subtask_data = multiprompt_task.data; subtask_data["prompt"] = subtask_data["prompt"][i]; // subtasks inherit everything else (infill mode, embedding mode, etc.) - subtask_ids[i] = request_completion(subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id); + request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id); } - - // queue up the multitask so we can track its subtask progression - add_multi_task(multitask_id, subtask_ids); - return multitask_id; } - void process_tasks() + void process_single_task(task_server& task) { - std::unique_lock lock(mutex_tasks); - std::vector deferred_tasks; - while (!queue_tasks.empty()) + switch (task.type) { - task_server task = queue_tasks.front(); - queue_tasks.erase(queue_tasks.begin()); - switch (task.type) - { - case TASK_TYPE_COMPLETION: { - llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); - if (slot == nullptr) - { - // if no slot is available, we defer this task for processing later - deferred_tasks.push_back(task); - break; - } - - if (task.data.contains("system_prompt")) - { - if (!all_slots_are_idle) { - send_error(task, "system prompt can only be updated when all slots are idle"); - break; - } - process_system_prompt_data(task.data["system_prompt"]); - - // reset cache_tokens for all slots - for (llama_client_slot &slot : slots) - { - slot.cache_tokens.clear(); - } - } - - slot->reset(); - - slot->infill = task.infill_mode; - slot->embedding = task.embedding_mode; - slot->task_id = task.id; - slot->multitask_id = task.multitask_id; - - if (!launch_slot_with_data(slot, task.data)) - { - // send error result - send_error(task, "internal_error"); - break; - } - } break; - case TASK_TYPE_CANCEL: { // release slot linked with the task id - for (auto & slot : slots) - { - if (slot.task_id == task.target_id) - { - slot.release(); - break; - } - } - } break; - } - } - - // add all the deferred tasks back the the queue - for (task_server &task : deferred_tasks) - { - queue_tasks.push_back(task); - } - - // remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue - std::vector agg_results; - auto queue_iterator = queue_multitasks.begin(); - while (queue_iterator != queue_multitasks.end()) - { - if (queue_iterator->subtasks_remaining.empty()) - { - // all subtasks done == multitask is done - task_result aggregate_result; - aggregate_result.id = queue_iterator->id; - aggregate_result.stop = true; - aggregate_result.error = false; - - // collect json results into one json result - std::vector result_jsons; - for (auto& subres : 
queue_iterator->results) + case TASK_TYPE_COMPLETION: { + llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); + if (slot == nullptr) { - result_jsons.push_back(subres.result_json); - aggregate_result.error = aggregate_result.error && subres.error; + // if no slot is available, we defer this task for processing later + LOG_VERBOSE("no slot is available", {}); + queue_tasks.defer(task); + break; } - aggregate_result.result_json = json{ "results", result_jsons }; + if (task.data.contains("system_prompt")) + { + if (!all_slots_are_idle) { + send_error(task, "system prompt can only be updated when all slots are idle"); + break; + } + process_system_prompt_data(task.data["system_prompt"]); - agg_results.push_back(aggregate_result); + // reset cache_tokens for all slots + for (llama_client_slot &slot : slots) + { + slot.cache_tokens.clear(); + } + } - condition_results.notify_all(); + slot->reset(); - queue_iterator = queue_multitasks.erase(queue_iterator); - } - else - { - ++queue_iterator; - } + slot->infill = task.infill_mode; + slot->embedding = task.embedding_mode; + slot->task_id = task.id; + slot->multitask_id = task.multitask_id; + + if (!launch_slot_with_data(slot, task.data)) + { + // send error result + send_error(task, "internal_error"); + break; + } + } break; + case TASK_TYPE_CANCEL: { // release slot linked with the task id + for (auto & slot : slots) + { + if (slot.task_id == task.target_id) + { + slot.release(); + break; + } + } + } break; + case TASK_TYPE_NEXT_RESPONSE: { + // do nothing + } break; } + } - // done with tasks, unlock - lock.unlock(); + void on_finish_multitask(task_multi& multitask) + { + // all subtasks done == multitask is done + task_result result; + result.id = multitask.id; + result.stop = true; + result.error = false; - // copy aggregate results of complete multi-tasks to the results queue - std::lock_guard lock_results(mutex_results); - queue_results.insert(queue_results.end(), agg_results.begin(), agg_results.end()); + // collect json results into one json result + std::vector result_jsons; + for (auto& subres : multitask.results) + { + result_jsons.push_back(subres.result_json); + result.error = result.error && subres.error; + } + result.result_json = json{ { "results", result_jsons } }; + queue_results.send(result); } bool update_slots() { - // attend tasks - process_tasks(); - if (system_need_update) { LOG_TEE("updating system prompt\n"); @@ -1684,40 +1363,45 @@ struct llama_server_context LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n"); kv_cache_clear(); } - std::unique_lock lock(mutex_tasks); - condition_tasks.wait(lock, [&]{ - return !queue_tasks.empty(); - }); + return true; + } else { + task_server task; + task.type = TASK_TYPE_NEXT_RESPONSE; + task.target_id = -1; + queue_tasks.post(task); } for (llama_client_slot &slot : slots) { - if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx) + if (slot.ga_n == 1) { - // Shift context - const int n_left = slot.n_past - slot.params.n_keep - 1; - const int n_discard = n_left / 2; - - LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard); - - for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++) + if (slot.is_processing() && 
slot.cache_tokens.size() >= (size_t) slot.n_ctx) { - slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; + // Shift context + const int n_left = slot.n_past - slot.params.n_keep - 1; + const int n_discard = n_left / 2; + + LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); + llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1); + llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard); + + for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++) + { + slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; + } + + slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); + + slot.n_past -= n_discard; + + slot.truncated = true; + + LOG_VERBOSE("context shift", { + { "n_ctx", n_ctx }, + { "n_keep", params.n_keep }, + { "n_left", n_left }, + }); } - - slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); - - slot.n_past -= n_discard; - - slot.truncated = true; - - LOG_VERBOSE("context shift", { - {"n_ctx", n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - }); } } @@ -1732,6 +1416,7 @@ struct llama_server_context slot.t_last_used = ggml_time_us(); LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); + queue_tasks.notify_slot_changed(); continue; } @@ -1743,7 +1428,8 @@ struct llama_server_context slot.i_batch = batch.n_tokens; - llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, { slot.id }, true); + const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; + llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true); slot.n_past += 1; } @@ -1841,6 +1527,8 @@ struct llama_server_context llama_sampling_reset(slot.ctx_sampling); slot.n_past = 0; + slot.n_past_se = 0; + slot.ga_i = 0; slot.num_prompt_tokens_processed = slot.num_prompt_tokens; } else @@ -1854,6 +1542,25 @@ struct llama_server_context slot.n_past = common_part(slot.cache_tokens, prompt_tokens); slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; + if (slot.ga_n != 1) + { + int ga_i = 0; + int32_t ga_n = slot.ga_n; + int32_t ga_w = slot.ga_w; + int32_t slot_npast = 0; + for (int k = 0; k < slot.n_past; ++k) + { + while (slot_npast >= ga_i + ga_w) { + const int bd = (ga_w/ga_n)*(ga_n - 1); + slot_npast -= bd; + ga_i += ga_w/ga_n; + } + slot_npast++; + } + slot.n_past_se = slot_npast; + slot.ga_i = ga_i; + } + LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); } @@ -1868,6 +1575,10 @@ struct llama_server_context // we have to evaluate at least 1 token to generate logits. LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id); slot.n_past--; + if (slot.ga_i > 0) + { + slot.n_past_se--; + } } LOG_VERBOSE("prompt ingested", { @@ -1880,9 +1591,22 @@ struct llama_server_context // process the prefix of first image std::vector prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens; + int32_t slot_npast = slot.n_past_se > 0 ? 
slot.n_past_se : slot.n_past; + int ga_i = slot.ga_i; + int32_t ga_n = slot.ga_n; + int32_t ga_w = slot.ga_w; for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past) { - llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false); + if (slot.ga_n != 1) + { + while (slot_npast >= ga_i + ga_w) { + const int bd = (ga_w/ga_n)*(ga_n - 1); + slot_npast -= bd; + ga_i += ga_w/ga_n; + } + } + llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false); + slot_npast += 1; } if (has_images && !ingest_images(slot, n_batch)) @@ -1912,6 +1636,36 @@ struct llama_server_context for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + + for (auto & slot : slots) + { + if (slot.ga_n != 1) + { + // context extension via Self-Extend + while (slot.n_past_se >= slot.ga_i + slot.ga_w) + { + const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w; + const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); + const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; + + LOG_TEE("\n"); + LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); + LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); + LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); + + llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); + llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n); + llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); + + slot.n_past_se -= bd; + + slot.ga_i += slot.ga_w / slot.ga_n; + + LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); + } + slot.n_past_se += n_tokens; + } + } llama_batch batch_view = { n_tokens, @@ -1925,6 +1679,7 @@ struct llama_server_context }; const int ret = llama_decode(ctx, batch_view); + if (ret != 0) { if (n_batch == 1 || ret < 0) @@ -1997,6 +1752,10 @@ struct llama_server_context } return true; } + + void run_on_all_tasks_finished() { + update_slots(); + } }; static void server_print_usage(const char *argv0, const gpt_params ¶ms, @@ -2066,6 +1825,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" types: int, float, bool. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); + printf(" -gan N, --grp-attn-n N Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`"); + printf(" -gaw N, --grp-attn-w N Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`"); printf("\n"); } @@ -2251,6 +2012,25 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_threads = std::stoi(argv[i]); } + else if (arg == "--grp-attn-n" || arg == "-gan") + { + if (++i >= argc) { + invalid_param = true; + break; + } + + params.grp_attn_n = std::stoi(argv[i]); + } + else if (arg == "--grp-attn-w" || arg == "-gaw") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + + params.grp_attn_w = std::stoi(argv[i]); + } else if (arg == "--threads-batch" || arg == "-tb") { if (++i >= argc) @@ -2541,239 +2321,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } } -static std::string random_string() -{ - static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - - std::random_device rd; - std::mt19937 generator(rd()); - - std::string result(32, ' '); - - for (int i = 0; i < 32; ++i) { - result[i] = str[generator() % str.size()]; - } - - return result; -} - -static std::string gen_chatcmplid() -{ - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - return chatcmplid.str(); -} - -std::string format_chatml(std::vector messages) -{ - std::ostringstream chatml_msgs; - - for (auto it = messages.begin(); it != messages.end(); ++it) { - chatml_msgs << "<|im_start|>" - << json_value(*it, "role", std::string("user")) << '\n'; - chatml_msgs << json_value(*it, "content", std::string("")) - << "<|im_end|>\n"; - } - - chatml_msgs << "<|im_start|>assistant" << '\n'; - - return chatml_msgs.str(); -} - /* llama.cpp completion api semantics */ -json oaicompat_completion_params_parse( - const json &body /* openai api json semantics */) -{ - json llama_params; - - llama_params["__oaicompat"] = true; - - // Map OpenAI parameters to llama.cpp parameters - // - // For parameters that are defined by the OpenAI documentation (e.g. 
- // temperature), we explicitly specify OpenAI's intended default; we - // need to do that because sometimes OpenAI disagrees with llama.cpp - // - // https://platform.openai.com/docs/api-reference/chat/create - llama_sampling_params default_sparams; - llama_params["model"] = json_value(body, "model", std::string("unknown")); - llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' - llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); - llama_params["temperature"] = json_value(body, "temperature", 0.0); - llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k); - llama_params["top_p"] = json_value(body, "top_p", 1.0); - llama_params["n_predict"] = json_value(body, "max_tokens", -1); - llama_params["logit_bias"] = json_value(body, "logit_bias",json::object()); - llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0); - llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); - llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED); - llama_params["stream"] = json_value(body, "stream", false); - llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat); - llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau); - llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta); - llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl); - llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p); - llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n); - llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); - llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z); - - if (body.count("grammar") != 0) { - llama_params["grammar"] = json_value(body, "grammar", json::object()); - } - - // Handle 'stop' field - if (body.contains("stop") && body["stop"].is_string()) { - llama_params["stop"] = json::array({body["stop"].get()}); - } else { - llama_params["stop"] = json_value(body, "stop", json::array()); - } - - // Ensure there is ChatML-specific end sequence among stop words - llama_params["stop"].push_back("<|im_end|>"); - - return llama_params; -} - -static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false) -{ - json result = response.result_json; - - bool stopped_word = result.count("stopped_word") != 0; - bool stopped_eos = json_value(result, "stopped_eos", false); - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason = "length"; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - - json choices = - streaming ? json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}) - : json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"message", json{{"content", content}, - {"role", "assistant"}}}}}); - - std::time_t t = std::time(0); - - json res = - json{{"choices", choices}, - {"created", t}, - {"model", - json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", streaming ? 
"chat.completion.chunk" : "chat.completion"}, - {"usage", - json{{"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, - {"id", gen_chatcmplid()}}; - - if (server_verbose) { - res["__verbose"] = result; - } - - if (result.contains("completion_probabilities")) { - res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); - } - - return res; -} - -// return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat(const task_result &response) { - json result = response.result_json; - - if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { - return std::vector({response.result_json}); - } - - bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; - std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - - bool stopped_word = json_value(result, "stopped_word", false); - bool stopped_eos = json_value(result, "stopped_eos", false); - bool stopped_limit = json_value(result, "stopped_limit", false); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - if (stopped_limit) { - finish_reason = "length"; - } - - std::time_t t = std::time(0); - - json choices; - - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = json{{"choices", json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"role", "assistant"} - }}}})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - json second_ret = json{ - {"choices", json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"content", content}}} - }})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } - } else { - // Some idiosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. 
- if (content.empty()) { - return std::vector({json::object()}); - } - - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); - } - } - - json ret = json{{"choices", choices}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({ret}); -} - static json format_partial_response( llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector &probs ) { @@ -3069,10 +2617,12 @@ int main(int argc, char **argv) return; } json data = json::parse(req.body); - const int task_id = llama.request_completion(data, false, false, -1); + const int task_id = llama.queue_tasks.get_new_id(); + llama.queue_results.add_waiting_task_id(task_id); + llama.request_completion(task_id, data, false, false, -1); if (!json_value(data, "stream", false)) { std::string completion_text; - task_result result = llama.next_result(task_id); + task_result result = llama.queue_results.recv(task_id); if (!result.error && result.stop) { res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); } @@ -3080,14 +2630,14 @@ int main(int argc, char **argv) { res.status = 404; res.set_content(result.result_json["content"], "text/plain; charset=utf-8"); - return; } + llama.queue_results.remove_waiting_task_id(task_id); } else { const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink) { while (true) { - task_result result = llama.next_result(task_id); + task_result result = llama.queue_results.recv(task_id); if (!result.error) { const std::string str = "data: " + @@ -3098,6 +2648,7 @@ int main(int argc, char **argv) }); if (!sink.write(str.c_str(), str.size())) { + llama.queue_results.remove_waiting_task_id(task_id); return false; } if (result.stop) { @@ -3113,11 +2664,14 @@ int main(int argc, char **argv) }); if (!sink.write(str.c_str(), str.size())) { + llama.queue_results.remove_waiting_task_id(task_id); return false; } break; } } + + llama.queue_results.remove_waiting_task_id(task_id); sink.done(); return true; }; @@ -3126,6 +2680,7 @@ int main(int argc, char **argv) { // cancel llama.request_cancel(task_id); + llama.queue_results.remove_waiting_task_id(task_id); }; res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); @@ -3162,11 +2717,13 @@ int main(int argc, char **argv) } json data = oaicompat_completion_params_parse(json::parse(req.body)); - const int task_id = llama.request_completion(data, false, false, -1); + const int task_id = llama.queue_tasks.get_new_id(); + llama.queue_results.add_waiting_task_id(task_id); + llama.request_completion(task_id, data, false, false, -1); if (!json_value(data, "stream", false)) { std::string completion_text; - task_result result = llama.next_result(task_id); + task_result result = llama.queue_results.recv(task_id); if (!result.error && result.stop) { json oaicompat_result = format_final_response_oaicompat(data, result); @@ -3177,12 +2734,12 @@ int main(int argc, char **argv) } else { res.status = 500; res.set_content(result.result_json["content"], "text/plain; charset=utf-8"); - return; } + llama.queue_results.remove_waiting_task_id(task_id); } else { const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) { while (true) { - task_result llama_result = llama.next_result(task_id); + task_result llama_result = 
llama.queue_results.recv(task_id); if (!llama_result.error) { std::vector result_array = format_partial_response_oaicompat( llama_result); @@ -3195,6 +2752,7 @@ int main(int argc, char **argv) "\n\n"; LOG_VERBOSE("data stream", {{"to_send", str}}); if (!sink.write(str.c_str(), str.size())) { + llama.queue_results.remove_waiting_task_id(task_id); return false; } } @@ -3210,18 +2768,21 @@ int main(int argc, char **argv) "\n\n"; LOG_VERBOSE("data stream", {{"to_send", str}}); if (!sink.write(str.c_str(), str.size())) { + llama.queue_results.remove_waiting_task_id(task_id); return false; } break; } } sink.done(); + llama.queue_results.remove_waiting_task_id(task_id); return true; }; auto on_complete = [task_id, &llama](bool) { // cancel request llama.request_cancel(task_id); + llama.queue_results.remove_waiting_task_id(task_id); }; res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); @@ -3235,10 +2796,12 @@ int main(int argc, char **argv) return; } json data = json::parse(req.body); - const int task_id = llama.request_completion(data, true, false, -1); + const int task_id = llama.queue_tasks.get_new_id(); + llama.queue_results.add_waiting_task_id(task_id); + llama.request_completion(task_id, data, true, false, -1); if (!json_value(data, "stream", false)) { std::string completion_text; - task_result result = llama.next_result(task_id); + task_result result = llama.queue_results.recv(task_id); if (!result.error && result.stop) { res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); @@ -3247,13 +2810,13 @@ int main(int argc, char **argv) { res.status = 404; res.set_content(result.result_json["content"], "text/plain; charset=utf-8"); - return; } + llama.queue_results.remove_waiting_task_id(task_id); } else { const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink) { while (true) { - task_result result = llama.next_result(task_id); + task_result result = llama.queue_results.recv(task_id); if (!result.error) { const std::string str = "data: " + @@ -3264,6 +2827,7 @@ int main(int argc, char **argv) }); if (!sink.write(str.c_str(), str.size())) { + llama.queue_results.remove_waiting_task_id(task_id); return false; } if (result.stop) @@ -3277,8 +2841,8 @@ int main(int argc, char **argv) } } + llama.queue_results.remove_waiting_task_id(task_id); sink.done(); - return true; }; @@ -3352,23 +2916,46 @@ int main(int argc, char **argv) image_data = ""; } - const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1); - task_result result = llama.next_result(task_id); + // create and queue the task + const int task_id = llama.queue_tasks.get_new_id(); + llama.queue_results.add_waiting_task_id(task_id); + llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1); + + // get the result + task_result result = llama.queue_results.recv(task_id); + llama.queue_results.remove_waiting_task_id(task_id); + + // send the result return res.set_content(result.result_json.dump(), "application/json; charset=utf-8"); }); // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!? 
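
Every handler touched in these hunks now follows the same lifecycle: reserve a task id, register it with `queue_results`, post the completion, block on `recv()`, and unregister the id on every exit path (including failed `sink.write()` calls and the `on_complete` cancel callback). A condensed sketch of the non-streaming path; the wrapper name is illustrative and error handling is trimmed, but the member calls are the ones introduced by this patch:

```cpp
// Condensed sketch of the non-streaming request path (types and member names
// come from server.cpp / utils.hpp in this patch; the wrapper itself is hypothetical).
static void handle_completion_blocking(llama_server_context & llama, const json & data, httplib::Response & res) {
    const int task_id = llama.queue_tasks.get_new_id();        // reserve an id up front
    llama.queue_results.add_waiting_task_id(task_id);          // register interest in the result
    llama.request_completion(task_id, data, false, false, -1); // enqueue the task

    task_result result = llama.queue_results.recv(task_id);    // blocks until a result with this id arrives
    if (!result.error && result.stop) {
        res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
    } else {
        res.status = 404;
        res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
    }
    llama.queue_results.remove_waiting_task_id(task_id);       // must run on every exit path
}
```
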
// "Bus error: 10" - this is on macOS, it does not crash on Linux //std::thread t2([&]() - { + /*{ bool running = true; while (running) { running = llama.update_slots(); } - } + }*/ //); + llama.queue_tasks.on_new_task(std::bind( + &llama_server_context::process_single_task, &llama, std::placeholders::_1)); + llama.queue_tasks.on_finish_multitask(std::bind( + &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1)); + llama.queue_tasks.on_all_tasks_finished(std::bind( + &llama_server_context::run_on_all_tasks_finished, &llama)); + llama.queue_results.on_multitask_update(std::bind( + &llama_server_queue::update_multitask, + &llama.queue_tasks, + std::placeholders::_1, + std::placeholders::_2, + std::placeholders::_3 + )); + llama.queue_tasks.start_loop(); + t.join(); llama_backend_free(); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp new file mode 100644 index 000000000..70cce0721 --- /dev/null +++ b/examples/server/utils.hpp @@ -0,0 +1,508 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "json.hpp" + +#include "../llava/clip.h" + +using json = nlohmann::json; + +extern bool server_verbose; + +#ifndef SERVER_VERBOSE +#define SERVER_VERBOSE 1 +#endif + +#if SERVER_VERBOSE != 1 +#define LOG_VERBOSE(MSG, ...) +#else +#define LOG_VERBOSE(MSG, ...) \ + do \ + { \ + if (server_verbose) \ + { \ + server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ + } \ + } while (0) +#endif + +#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) + +// +// parallel +// + +enum server_state { + SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet + SERVER_STATE_READY, // Server is ready and model is loaded + SERVER_STATE_ERROR // An error occurred, load_model failed +}; + +enum task_type { + TASK_TYPE_COMPLETION, + TASK_TYPE_CANCEL, + TASK_TYPE_NEXT_RESPONSE +}; + +struct task_server { + int id = -1; // to be filled by llama_server_queue + int target_id; + task_type type; + json data; + bool infill_mode = false; + bool embedding_mode = false; + int multitask_id = -1; +}; + +struct task_result { + int id; + int multitask_id = -1; + bool stop; + bool error; + json result_json; +}; + +struct task_multi { + int id; + std::set subtasks_remaining{}; + std::vector results{}; +}; + +// TODO: can become bool if we can't find use of more states +enum slot_state +{ + IDLE, + PROCESSING, +}; + +enum slot_command +{ + NONE, + LOAD_PROMPT, + RELEASE, +}; + +struct slot_params +{ + bool stream = true; + bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt + + uint32_t seed = -1; // RNG seed + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_predict = -1; // new tokens to predict + + std::vector antiprompt; + + json input_prefix; + json input_suffix; +}; + +struct slot_image +{ + int32_t id; + + bool request_encode_image = false; + float * image_embedding = nullptr; + int32_t image_tokens = 0; + + clip_image_u8 * img_data; + + std::string prefix_prompt; // before of this image +}; + +// completion token output with probabilities +struct completion_token_output +{ + struct token_prob + { + llama_token tok; + float prob; + }; + + std::vector probs; + llama_token tok; + std::string text_to_send; +}; + +static inline void server_log(const char 
*level, const char *function, int line, + const char *message, const nlohmann::ordered_json &extra) +{ + nlohmann::ordered_json log + { + {"timestamp", time(nullptr)}, + {"level", level}, + {"function", function}, + {"line", line}, + {"message", message}, + }; + + if (!extra.empty()) + { + log.merge_patch(extra); + } + + const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); + printf("%.*s\n", (int)str.size(), str.data()); + fflush(stdout); +} + +// +// server utils +// + +template +static T json_value(const json &body, const std::string &key, const T &default_value) +{ + // Fallback null to default value + return body.contains(key) && !body.at(key).is_null() + ? body.value(key, default_value) + : default_value; +} + +inline std::string format_chatml(std::vector messages) +{ + std::ostringstream chatml_msgs; + + for (auto it = messages.begin(); it != messages.end(); ++it) { + chatml_msgs << "<|im_start|>" + << json_value(*it, "role", std::string("user")) << '\n'; + chatml_msgs << json_value(*it, "content", std::string("")) + << "<|im_end|>\n"; + } + + chatml_msgs << "<|im_start|>assistant" << '\n'; + + return chatml_msgs.str(); +} + +// +// work queue utils +// + +struct llama_server_queue { + int id = 0; + std::mutex mutex_tasks; + // queues + std::vector queue_tasks; + std::vector queue_tasks_deferred; + std::vector queue_multitasks; + std::condition_variable condition_tasks; + // callback functions + std::function callback_new_task; + std::function callback_finish_multitask; + std::function callback_all_task_finished; + + // Add a new task to the end of the queue + int post(task_server task) { + std::unique_lock lock(mutex_tasks); + if (task.id == -1) { + task.id = id++; + } + queue_tasks.push_back(std::move(task)); + condition_tasks.notify_one(); + return task.id; + } + + // Add a new task, but defer until one slot is available + void defer(task_server task) { + std::unique_lock lock(mutex_tasks); + queue_tasks_deferred.push_back(std::move(task)); + } + + // Get the next id for creating anew task + int get_new_id() { + std::unique_lock lock(mutex_tasks); + return id++; + } + + // Register function to process a new task + void on_new_task(std::function callback) { + callback_new_task = callback; + } + + // Register function to process a multitask + void on_finish_multitask(std::function callback) { + callback_finish_multitask = callback; + } + + // Register the function to be called when the batch of tasks is finished + void on_all_tasks_finished(std::function callback) { + callback_all_task_finished = callback; + } + + // Call when the state of one slot is changed + void notify_slot_changed() { + // move deferred tasks back to main loop + std::unique_lock lock(mutex_tasks); + for (auto & task : queue_tasks_deferred) { + queue_tasks.push_back(std::move(task)); + } + queue_tasks_deferred.clear(); + } + + // Start the main loop. 
This call is blocking + [[noreturn]] + void start_loop() { + while (true) { + // new task arrived + LOG_VERBOSE("have new task", {}); + { + while (true) + { + std::unique_lock lock(mutex_tasks); + if (queue_tasks.empty()) { + lock.unlock(); + break; + } + task_server task = queue_tasks.front(); + queue_tasks.erase(queue_tasks.begin()); + lock.unlock(); + LOG_VERBOSE("callback_new_task", {}); + callback_new_task(task); + } + LOG_VERBOSE("callback_all_task_finished", {}); + // process and update all the multitasks + auto queue_iterator = queue_multitasks.begin(); + while (queue_iterator != queue_multitasks.end()) + { + if (queue_iterator->subtasks_remaining.empty()) + { + // all subtasks done == multitask is done + task_multi current_multitask = *queue_iterator; + callback_finish_multitask(current_multitask); + // remove this multitask + queue_iterator = queue_multitasks.erase(queue_iterator); + } + else + { + ++queue_iterator; + } + } + // all tasks in the current loop is finished + callback_all_task_finished(); + } + LOG_VERBOSE("wait for new task", {}); + // wait for new task + { + std::unique_lock lock(mutex_tasks); + if (queue_tasks.empty()) { + condition_tasks.wait(lock, [&]{ + return !queue_tasks.empty(); + }); + } + } + } + } + + // + // functions to manage multitasks + // + + // add a multitask by specifying the id of all subtask (subtask is a task_server) + void add_multitask(int multitask_id, std::vector& sub_ids) + { + std::lock_guard lock(mutex_tasks); + task_multi multi; + multi.id = multitask_id; + std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end())); + queue_multitasks.push_back(multi); + } + + // updatethe remaining subtasks, while appending results to multitask + void update_multitask(int multitask_id, int subtask_id, task_result& result) + { + std::lock_guard lock(mutex_tasks); + for (auto& multitask : queue_multitasks) + { + if (multitask.id == multitask_id) + { + multitask.subtasks_remaining.erase(subtask_id); + multitask.results.push_back(result); + } + } + } +}; + +struct llama_server_response { + typedef std::function callback_multitask_t; + callback_multitask_t callback_update_multitask; + // for keeping track of all tasks waiting for the result + std::set waiting_task_ids; + // the main result queue + std::vector queue_results; + std::mutex mutex_results; + std::condition_variable condition_results; + + void add_waiting_task_id(int task_id) { + std::unique_lock lock(mutex_results); + waiting_task_ids.insert(task_id); + } + + void remove_waiting_task_id(int task_id) { + std::unique_lock lock(mutex_results); + waiting_task_ids.erase(task_id); + } + + // This function blocks the thread until there is a response for this task_id + task_result recv(int task_id) { + while (true) + { + std::unique_lock lock(mutex_results); + condition_results.wait(lock, [&]{ + return !queue_results.empty(); + }); + LOG_VERBOSE("condition_results unblock", {}); + + for (int i = 0; i < (int) queue_results.size(); i++) + { + if (queue_results[i].id == task_id) + { + assert(queue_results[i].multitask_id == -1); + task_result res = queue_results[i]; + queue_results.erase(queue_results.begin() + i); + return res; + } + } + } + + // should never reach here + } + + // Register the function to update multitask + void on_multitask_update(callback_multitask_t callback) { + callback_update_multitask = callback; + } + + // Send a new result to a waiting task_id + void send(task_result result) { + std::unique_lock lock(mutex_results); + 
LOG_VERBOSE("send new result", {}); + for (auto& task_id : waiting_task_ids) { + // LOG_TEE("waiting task id %i \n", task_id); + // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result + if (result.multitask_id == task_id) + { + LOG_VERBOSE("callback_update_multitask", {}); + callback_update_multitask(task_id, result.id, result); + continue; + } + + if (result.id == task_id) + { + LOG_VERBOSE("queue_results.push_back", {}); + queue_results.push_back(result); + condition_results.notify_one(); + return; + } + } + } +}; + +// +// base64 utils (TODO: move to common in the future) +// + +static const std::string base64_chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + +static inline bool is_base64(uint8_t c) +{ + return (isalnum(c) || (c == '+') || (c == '/')); +} + +static inline std::vector base64_decode(const std::string & encoded_string) +{ + int i = 0; + int j = 0; + int in_ = 0; + + int in_len = encoded_string.size(); + + uint8_t char_array_4[4]; + uint8_t char_array_3[3]; + + std::vector ret; + + while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) + { + char_array_4[i++] = encoded_string[in_]; in_++; + if (i == 4) + { + for (i = 0; i <4; i++) + { + char_array_4[i] = base64_chars.find(char_array_4[i]); + } + + char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (i = 0; (i < 3); i++) + { + ret.push_back(char_array_3[i]); + } + i = 0; + } + } + + if (i) + { + for (j = i; j <4; j++) + { + char_array_4[j] = 0; + } + + for (j = 0; j <4; j++) + { + char_array_4[j] = base64_chars.find(char_array_4[j]); + } + + char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (j = 0; (j < i - 1); j++) + { + ret.push_back(char_array_3[j]); + } + } + + return ret; +} + +// +// random string / id +// + +static std::string random_string() +{ + static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + + std::random_device rd; + std::mt19937 generator(rd()); + + std::string result(32, ' '); + + for (int i = 0; i < 32; ++i) { + result[i] = str[generator() % str.size()]; + } + + return result; +} + +static std::string gen_chatcmplid() +{ + std::stringstream chatcmplid; + chatcmplid << "chatcmpl-" << random_string(); + return chatcmplid.str(); +} diff --git a/flake.lock b/flake.lock index cd532ef4f..1b253cb44 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1705133751, - "narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=", + "lastModified": 1705677747, + "narHash": "sha256-eyM3okYtMgYDgmYukoUzrmuoY4xl4FUujnsv/P6I/zI=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d", + "rev": "bbe7d8f876fbbe7c959c90ba2ae2852220573261", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index ec62c773a..a776ba024 100644 --- a/flake.nix +++ b/flake.nix @@ -1,3 +1,17 @@ +# The flake interface to llama.cpp's Nix expressions. 
The flake is used as a +# more discoverable entry-point, as well as a way to pin the dependencies and +# expose default outputs, including the outputs built by the CI. + +# For more serious applications involving some kind of customization you may +# want to consider consuming the overlay, or instantiating `llamaPackages` +# directly: +# +# ```nix +# pkgs.callPackage ${llama-cpp-root}/.devops/nix/scope.nix { }` +# ``` + +# Cf. https://jade.fyi/blog/flakes-arent-real/ for a more detailed exposition +# of the relation between Nix and the Nix Flakes. { description = "Port of Facebook's LLaMA model in C/C++"; diff --git a/ggml-alloc.c b/ggml-alloc.c index 89b85d348..95a93c99d 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { if (block->size >= size) { best_fit_block = alloc->n_free_blocks - 1; } else { - fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", - __func__, size, max_avail); + fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n", + __func__, tensor->name, size, max_avail); GGML_ASSERT(!"not enough space in the buffer"); return; } @@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) { } size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) { - return alloc->max_size; + // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail + // to avoid this, we add a 10% margin to the buffer size + return alloc->max_size + alloc->max_size/10; } // graph allocator diff --git a/ggml-backend.c b/ggml-backend.c index ef518dae0..3fff5fc87 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -30,7 +30,9 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) { GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { // get_alloc_size is optional, defaults to ggml_nbytes if (buft->iface.get_alloc_size) { - return buft->iface.get_alloc_size(buft, tensor); + size_t size = buft->iface.get_alloc_size(buft, tensor); + assert(size >= ggml_nbytes(tensor)); + return size; } return ggml_nbytes(tensor); } @@ -1191,6 +1193,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g ggml_tallocr_t src_allocr = node_allocr(src); GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now if (src_allocr != node_allocr) { + // create a copy of the input in the split's backend + size_t id = hash_id(src); + if (sched->node_copies[id][cur_backend_id] == NULL) { + ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); + struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); + ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name); + + sched->node_copies[id][cur_backend_id] = tensor_copy; + node_allocr(tensor_copy) = cur_allocr; + SET_CAUSE(tensor_copy, "4.cpy"); + + int n_inputs = sched->splits[cur_split].n_inputs++; + GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); + sched->splits[cur_split].inputs[n_inputs] = src; + } + node->src[j] = sched->node_copies[id][cur_backend_id]; + +#if 0 // check if the input is already in the split bool found = false; for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) { @@ -1206,19 +1226,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); sched->splits[cur_split].inputs[n_inputs] = src; } - - 
// create a copy of the input in the split's backend - size_t id = hash_id(src); - if (sched->node_copies[id][cur_backend_id] == NULL) { - ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); - struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); - ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name); - - sched->node_copies[id][cur_backend_id] = tensor_copy; - node_allocr(tensor_copy) = cur_allocr; - SET_CAUSE(tensor_copy, "4.cpy"); - } - node->src[j] = sched->node_copies[id][cur_backend_id]; +#endif } } } @@ -1333,7 +1341,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { uint64_t compute_start_us = ggml_time_us(); if (!sched->callback_eval) { ggml_backend_graph_compute(split_backend, &split->graph); - //ggml_backend_synchronize(split_backend); // necessary to measure compute time + //ggml_backend_synchronize(split_backend); // necessary to measure compute time } else { // similar to ggml_backend_compare_graph_backend for (int j0 = 0; j0 < split->graph.n_nodes; j0++) { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index ec3837fb8..0d599e20a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -13,6 +13,10 @@ #include #include +// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string +#define STRINGIZE_IMPL(...) #__VA_ARGS__ +#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__) + #if defined(GGML_USE_HIPBLAS) #include #include @@ -584,13 +588,28 @@ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; [[noreturn]] -static __device__ void bad_arch() { - printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n"); +static __device__ void no_device_code( + const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n", + file_name, line, function_name, arch); + (void) arch_list; +#else + printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. 
ggml-cuda.cu was compiled for: %s\n", + file_name, line, function_name, arch, arch_list); +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) __trap(); - (void) bad_arch; // suppress unused function warning + (void) no_device_code; // suppress unused function warning } +#ifdef __CUDA_ARCH__ +#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__)) +#else +#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.") +#endif // __CUDA_ARCH__ + static __device__ __forceinline__ float warp_reduce_sum(float x) { #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { @@ -617,7 +636,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { return a; #else (void) a; - bad_arch(); + NO_DEVICE_CODE; #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL } @@ -638,7 +657,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { return x; #else (void) x; - bad_arch(); + NO_DEVICE_CODE; #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX } @@ -2421,7 +2440,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h } #else (void) vx; (void) y; (void) k; - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_PASCAL } @@ -2452,7 +2471,7 @@ template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp // second part effectively subtracts 8 from each quant value return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2489,7 +2508,7 @@ template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2524,7 +2543,7 @@ template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp // second part effectively subtracts 16 from each quant value return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2569,7 +2588,7 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp return sumi*d5d8 + m5s8 / (QI5_1 / vdr); #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2590,7 +2609,7 @@ template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp return d8_0*d8_1 * sumi; #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2620,7 +2639,7 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it return sumi*d8d8 + m8s8 / (QI8_1 / vdr); #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2655,7 +2674,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( return dm2f.x*sumf_d - dm2f.y*sumf_m; #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2692,7 +2711,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2732,7 +2751,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( return d3 * 
sumf; #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2757,7 +2776,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( return d3*d8 * sumi; #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2790,7 +2809,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( return dm4f.x*sumf_d - dm4f.y*sumf_m; #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2823,7 +2842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( return dm4f.x*sumf_d - dm4f.y*sumf_m; #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2863,7 +2882,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( return dm5f.x*sumf_d - dm5f.y*sumf_m; #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2896,7 +2915,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( return dm4f.x*sumf_d - dm4f.y*sumf_m; #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2926,7 +2945,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( return d*sumf; #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2957,7 +2976,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( return d6 * sumf_d; #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -3823,7 +3842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( return dall * sumf_d - dmin * sumf_m; #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif @@ -4006,7 +4025,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( return d * sumf_d; #else - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif @@ -4264,7 +4283,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( q8 += 8; aux32 >>= 7; } - const float d = (float)bq2->d * (0.5f + aux32) * (float)bq8_1[ib32].ds.x * 0.25f; + const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f; return d * sumi; #else // iqs is 0...15 @@ -4275,7 +4294,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]); const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]); const uint32_t aux32 = q2[2] | (q2[3] << 16); - const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f; + const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f; const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127]; const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127]; const int8_t * q8 = bq8_1[ib32].qs + 16*il; @@ -4320,7 +4339,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( } q8 += 8; } - const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f; + const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f; return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); #else assert(false); @@ -4501,7 +4520,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q4_0_q8_1_mul_mat; - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4570,7 +4589,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q4_1_q8_1_mul_mat; - bad_arch(); + NO_DEVICE_CODE; #endif // 
__CUDA_ARCH__ >= CC_VOLTA } @@ -4637,7 +4656,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q5_0_q8_1_mul_mat; - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4704,7 +4723,7 @@ mul_mat_q5_1( (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q5_1_q8_1_mul_mat; - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4771,7 +4790,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q8_0_q8_1_mul_mat; - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4838,7 +4857,7 @@ mul_mat_q2_K( (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q2_K_q8_1_mul_mat; - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4907,7 +4926,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q3_K_q8_1_mul_mat; - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4976,7 +4995,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q4_K_q8_1_mul_mat; - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -5043,7 +5062,7 @@ mul_mat_q5_K( (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q5_K_q8_1_mul_mat; - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -5112,7 +5131,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q6_K_q8_1_mul_mat; - bad_arch(); + NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -5835,7 +5854,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds } #else (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale; - bad_arch(); + NO_DEVICE_CODE; #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX } @@ -9771,8 +9790,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s // TODO: mmq/mmv support #endif - const int64_t nb11 = src1->nb[1]; - const int64_t nb1 = dst->nb[1]; + const size_t nb11 = src1->nb[1]; + const size_t nb1 = dst->nb[1]; const struct ggml_tensor * ids = src0; const int32_t id = ((int32_t *) dst->op_params)[0]; @@ -10285,15 +10304,11 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t if (ggml_is_quantized(tensor->type)) { // initialize padding to 0 to avoid possible NaN values - int64_t row_low = 0; - int64_t row_high = ggml_nrows(tensor); - int64_t nrows_split = row_high - row_low; - - size_t original_size = ggml_nbytes_split(tensor, nrows_split); + size_t original_size = ggml_nbytes(tensor); size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); if (padded_size > original_size && tensor->view_src == nullptr) { - CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0])); + CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size)); } } } @@ -10396,12 +10411,7 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend } GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - 
int64_t row_low = 0; - int64_t row_high = ggml_nrows(tensor); - int64_t nrows_split = row_high - row_low; - - size_t size = ggml_nbytes_split(tensor, nrows_split); - + size_t size = ggml_nbytes(tensor); int64_t ne0 = tensor->ne[0]; if (ggml_is_quantized(tensor->type)) { diff --git a/ggml-metal.m b/ggml-metal.m index cf7880c82..eabc16f41 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -26,15 +26,6 @@ #define GGML_METAL_MAX_KERNELS 256 -struct ggml_metal_buffer { - const char * name; - - void * data; - size_t size; - - id metal; -}; - struct ggml_metal_kernel { id function; id pipeline; @@ -175,9 +166,6 @@ struct ggml_metal_context { dispatch_queue_t d_queue; - int n_buffers; - struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; - struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS]; bool support_simdgroup_reduction; @@ -245,24 +233,20 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { // Show all the Metal device instances in the system NSArray * devices = MTLCopyAllDevices(); for (id device in devices) { - NSString * s = [device name]; - GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]); + GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]); } [devices release]; // since it was created by a *Copy* C method #endif // Pick and show default Metal device id device = MTLCreateSystemDefaultDevice(); - NSString * s = [device name]; - GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]); + GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); // Configure context struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); ctx->device = device; ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); ctx->queue = [ctx->device newCommandQueue]; - ctx->n_buffers = 0; - ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); // load library @@ -543,10 +527,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { static void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); - for (int i = 0; i < ctx->n_buffers; ++i) { - [ctx->buffers[i].metal release]; - } - for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) { if (ctx->kernels[i].pipeline) { [ctx->kernels[i].pipeline release]; @@ -589,51 +569,30 @@ struct ggml_backend_metal_buffer_context { // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer // -static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { +static id ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) { //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); ggml_backend_buffer_t buffer = t->view_src ? 
t->view_src->buffer : t->buffer; - // compatibility with ggml-backend - if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) { - struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context; - - // find the view that contains the tensor fully - for (int i = 0; i < buf_ctx->n_buffers; ++i) { - const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data; - - //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); - if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) { - *offs = (size_t) ioffs; - - //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); - - return buf_ctx->buffers[i].metal; - } - } - - GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name); - - return nil; - } + struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context; // find the view that contains the tensor fully - for (int i = 0; i < ctx->n_buffers; ++i) { - const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data; + for (int i = 0; i < buf_ctx->n_buffers; ++i) { + const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data; - //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name); - if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { + //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); + if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) { *offs = (size_t) ioffs; - //GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); + //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); - return ctx->buffers[i].metal; + return buf_ctx->buffers[i].metal; } } - GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__); + GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name); return nil; } @@ -681,7 +640,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const return true; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: - return ctx->support_simdgroup_reduction; + return ctx->support_simdgroup_reduction && + (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32); case GGML_OP_CPY: case GGML_OP_DUP: case GGML_OP_CONT: @@ -826,9 +786,9 @@ static bool ggml_metal_graph_compute( const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; - id id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil; - id id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; - id id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; + id id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil; + id id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil; + id id_dst = dst ? 
ggml_metal_get_buffer(dst, &offs_dst) : nil; //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); //if (src0) { @@ -1610,7 +1570,7 @@ static bool ggml_metal_graph_compute( struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; size_t offs_src_cur = 0; - id id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); + id id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur); [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j]; } @@ -1755,7 +1715,7 @@ static bool ggml_metal_graph_compute( struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; size_t offs_src_cur = 0; - id id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); + id id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur); [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j]; } @@ -2189,9 +2149,9 @@ static bool ggml_metal_graph_compute( size_t offs_src3 = 0; GGML_ASSERT(src2); - id id_src2 = ggml_metal_get_buffer(ctx, src2, &offs_src2); + id id_src2 = ggml_metal_get_buffer(src2, &offs_src2); - id id_src3 = src3 ? ggml_metal_get_buffer(ctx, src3, &offs_src3) : nil; + id id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil; const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30); const int64_t ne31 = src3 ? src3->ne[1] : 0; diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 2bb93638f..bf9ad964f 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -714,7 +714,6 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, dst[row] = tmp[0]; } } - ); @@ -784,6 +783,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float dst[row] = tmp[0]; } } + ); @@ -799,6 +799,18 @@ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y } ); +std::string add_template = MULTILINE_QUOTE( +__kernel void add_f32(__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) { + const int i = get_group_id(0)*get_local_size(0) + get_local_id(0); + + if (i >= get_global_size(0)) { + return; + } + + dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky]; +} +); + #define CL_CHECK(err) \ do { \ cl_int err_ = (err); \ @@ -878,6 +890,7 @@ static std::string generate_kernels() { } src << mul_kernel << '\n'; } + src << add_template << '\n'; return src.str(); } @@ -893,6 +906,7 @@ static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl; static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl; static cl_kernel mul_f32_cl; +static cl_kernel add_f32_cl; static bool fp16_support; static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { @@ -1100,9 +1114,10 @@ void ggml_cl_init(void) { char *ext_buffer = (char *)alloca(ext_str_size + 1); clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated + // Disabled due to faulty outputs // Check if ext_buffer contains cl_khr_fp16 - fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL; - fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? 
"true" : "false"); + fp16_support = false; // strstr(ext_buffer, "cl_khr_fp16") != NULL; + // fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false"); cl_context_properties properties[] = { (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0 @@ -1150,6 +1165,8 @@ void ggml_cl_init(void) { // mul kernel CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err)); + + CL_CHECK((add_f32_cl = clCreateKernel(program, "add_f32", &err), err)); } static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { @@ -1458,6 +1475,70 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src ggml_cl_mul_f32(src0, src1, dst); } +static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + size_t x_size; + size_t d_size; + + cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0 + cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted. + cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst + + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + cl_event ev; + + // copy src0 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev)); + + const int64_t i13 = i03%ne13; + const int64_t i12 = i02%ne12; + const int i1 = i13*ne12*ne11 + i12*ne11; + + cl_int x_offset = 0; + cl_int y_offset = i1*ne10; + cl_int d_offset = 0; + + size_t global = ne00 * ne01; + cl_int ky = ne10 * ne11; + + CL_CHECK(clSetKernelArg(add_f32_cl, 0, sizeof(cl_mem), &d_X)); + CL_CHECK(clSetKernelArg(add_f32_cl, 1, sizeof(cl_int), &x_offset)); + CL_CHECK(clSetKernelArg(add_f32_cl, 2, sizeof(cl_mem), &d_Y)); + CL_CHECK(clSetKernelArg(add_f32_cl, 3, sizeof(cl_int), &y_offset)); + CL_CHECK(clSetKernelArg(add_f32_cl, 4, sizeof(cl_mem), &d_D)); + CL_CHECK(clSetKernelArg(add_f32_cl, 5, sizeof(cl_int), &d_offset)); + CL_CHECK(clSetKernelArg(add_f32_cl, 6, sizeof(cl_int), &ky)); + CL_CHECK(clEnqueueNDRangeKernel(queue, add_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); + + CL_CHECK(clReleaseEvent(ev)); + CL_CHECK(clFinish(queue)); + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL)); + } + } + ggml_cl_pool_free(d_X, x_size); + ggml_cl_pool_free(d_D, d_size); +} + +void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cl_add_f32(src0, src1, dst); +} + static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; diff --git a/ggml-opencl.h b/ggml-opencl.h index 919b00d63..257a6be6a 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -10,6 +10,7 @@ extern "C" { GGML_API void ggml_cl_init(void); GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); 
+GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst); GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); diff --git a/ggml.c b/ggml.c index 5e515c03f..6bba840d9 100644 --- a/ggml.c +++ b/ggml.c @@ -1478,6 +1478,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); } +// TODO: optimize performance +inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } +inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } static const float GELU_COEF_A = 0.044715f; static const float GELU_QUICK_COEF = -1.702f; @@ -1838,9 +1841,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = { "GELU", "GELU_QUICK", "SILU", + "HARDSWISH", + "HARDSIGMOID", }; -static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10"); +static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); @@ -4007,6 +4012,20 @@ struct ggml_tensor * ggml_silu_back( return result; } +// ggml hardswish +struct ggml_tensor * ggml_hardswish( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH); +} + +// ggml hardsigmoid +struct ggml_tensor * ggml_hardsigmoid( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID); +} + // ggml_norm static struct ggml_tensor * ggml_norm_impl( @@ -5408,6 +5427,31 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( return result; } +// ggml_conv_depthwise +struct ggml_tensor * ggml_conv_depthwise_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); + struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, + ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), + s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW] + + struct ggml_tensor * result = + ggml_mul_mat(ctx, + ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC,1, KH, KW] => [1, OC, 1, KH * KW] + ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] + + 
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW] + + return result; +} // ggml_conv_2d // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] @@ -7278,6 +7322,17 @@ static void ggml_compute_forward_add_f32( const int ith = params->ith; const int nth = params->nth; +#ifdef GGML_USE_CLBLAST + if (src1->backend == GGML_BACKEND_GPU) { + // TODO: OpenCL kernel support full broadcast + GGML_ASSERT(ggml_can_repeat_rows(src1, src0)); + if (ith == 0) { + ggml_cl_add(src0, src1, dst); + } + return; + } +#endif + const int nr = ggml_nrows(src0); GGML_TENSOR_BINARY_OP_LOCALS @@ -7558,7 +7613,12 @@ static void ggml_compute_forward_add( switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_add_f32(params, src0, src1, dst); + if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add_f32(params, src0, src1, dst); + } + else { + GGML_ASSERT(false); + } } break; case GGML_TYPE_F16: { @@ -7879,6 +7939,9 @@ static void ggml_compute_forward_acc_f32( bool inplace = (bool) ((int32_t *) dst->op_params)[4]; if (!inplace && (params->type == GGML_TASK_INIT)) { + if (params->ith != 0) { + return; + } // memcpy needs to be synchronized across threads to avoid race conditions. // => do it in INIT phase memcpy( @@ -9448,6 +9511,87 @@ static void ggml_compute_forward_silu_back( } } + +static void ggml_compute_forward_hardswish_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_hardswish_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} +static void ggml_compute_forward_hardswish( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_hardswish_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_hardsigmoid_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_hardsigmoid_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_hardsigmoid( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_hardsigmoid_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + + // ggml_compute_forward_norm static void ggml_compute_forward_norm_f32( @@ -9940,11 +10084,30 @@ static void ggml_compute_forward_mul_mat( #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if 
(ggml_compute_forward_mul_mat_use_blas(dst)) { - if (params->ith != 0) { - return; - } + const int64_t ne_plane = ne01*ne00; + const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float); + UNUSED(desired_wsize); if (params->type == GGML_TASK_INIT) { + if (type != GGML_TYPE_F32) { + assert(params->wsize >= desired_wsize); + // parallelize by src0 rows + for (int64_t i13 = 0; i13 < ne13; i13++) { + for (int64_t i12 = 0; i12 < ne12; i12++) { + // broadcast src0 into src1 across 2nd,3rd dimension + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane; + ggml_to_float_t const to_float = type_traits[type].to_float; + + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00); + } + } + } + } return; } @@ -9952,9 +10115,14 @@ static void ggml_compute_forward_mul_mat( return; } + // perform sgemm, parallelization controlled by blas lib + if (ith != 0) { + return; + } + + //const int64_t tgemm0 = ggml_perf_time_us(); for (int64_t i13 = 0; i13 < ne13; i13++) { for (int64_t i12 = 0; i12 < ne12; i12++) { - // broadcast src0 into src1 across 2nd,3rd dimension const int64_t i03 = i13/r3; const int64_t i02 = i12/r2; @@ -9963,17 +10131,7 @@ static void ggml_compute_forward_mul_mat( float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); if (type != GGML_TYPE_F32) { - float * const wdata = params->wdata; - ggml_to_float_t const to_float = type_traits[type].to_float; - - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - to_float((const char *) x + i01*nb01, wdata + id, ne00); - id += ne00; - } - - assert(id*sizeof(float) <= params->wsize); - x = wdata; + x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane; } cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, @@ -9983,6 +10141,7 @@ static void ggml_compute_forward_mul_mat( 0.0f, d, ne01); } } + //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2); //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); @@ -9991,6 +10150,9 @@ static void ggml_compute_forward_mul_mat( #endif if (params->type == GGML_TASK_INIT) { + if (ith != 0) { + return; + } if (src1->type != vec_dot_type) { char * wdata = params->wdata; const size_t row_size = ggml_row_size(vec_dot_type, ne10); @@ -10155,6 +10317,9 @@ static void ggml_compute_forward_mul_mat_id( #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] if (params->type == GGML_TASK_INIT) { + if (ith != 0) { + return; + } char * wdata = params->wdata; if (src1->type != vec_dot_type) { const size_t row_size = ggml_row_size(vec_dot_type, ne10); @@ -10340,6 +10505,9 @@ static void ggml_compute_forward_out_prod_f32( return; } #endif + if (ith != 0) { + return; + } ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); return; } @@ -10523,6 +10691,9 @@ static void ggml_compute_forward_out_prod_q_f32( // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (params->type == GGML_TASK_INIT) { + if (ith != 0) { + return; + } ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); return; } @@ -10707,6 +10878,9 @@ static void ggml_compute_forward_set_f32( bool inplace = (bool) ((int32_t *) dst->op_params)[4]; if (!inplace && (params->type == GGML_TASK_INIT)) { + if (params->ith != 0) { + return; + } // memcpy needs to be synchronized across 
threads to avoid race conditions. // => do it in INIT phase memcpy( @@ -11031,6 +11205,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16( // ggml_compute_forward_dup_same_cont(params, opt0, dst); if (params->type == GGML_TASK_INIT) { + if (params->ith != 0) { + return; + } memset(dst->data, 0, ggml_nbytes(dst)); } @@ -11065,6 +11242,9 @@ static void ggml_compute_forward_get_rows_back_f32( // ggml_compute_forward_dup_same_cont(params, opt0, dst); if (params->type == GGML_TASK_INIT) { + if (params->ith != 0) { + return; + } memset(dst->data, 0, ggml_nbytes(dst)); } @@ -11202,6 +11382,9 @@ static void ggml_compute_forward_diag_mask_f32( GGML_ASSERT(n_past >= 0); if (!inplace && (params->type == GGML_TASK_INIT)) { + if (ith != 0) { + return; + } // memcpy needs to be synchronized across threads to avoid race conditions. // => do it in INIT phase GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); @@ -12172,6 +12355,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { + if (ith != 0) { + return; + } memset(params->wdata, 0, params->wsize); // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) @@ -12266,6 +12452,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32( GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { + if (ith != 0) { + return; + } memset(params->wdata, 0, params->wsize); // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) @@ -12464,6 +12653,7 @@ static void ggml_compute_forward_im2col( } } + // ggml_compute_forward_conv_transpose_2d static void ggml_compute_forward_conv_transpose_2d( @@ -12489,6 +12679,9 @@ static void ggml_compute_forward_conv_transpose_2d( GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { + if (ith != 0) { + return; + } memset(params->wdata, 0, params->wsize); // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) @@ -14228,6 +14421,14 @@ static void ggml_compute_forward_unary( { ggml_compute_forward_silu(params, src0, dst); } break; + case GGML_UNARY_OP_HARDSWISH: + { + ggml_compute_forward_hardswish(params, src0, dst); + } break; + case GGML_UNARY_OP_HARDSIGMOID: + { + ggml_compute_forward_hardsigmoid(params, src0, dst); + } break; default: { GGML_ASSERT(false); @@ -14291,6 +14492,9 @@ static void ggml_compute_forward_add_rel_pos_f32( const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; if (!inplace && params->type == GGML_TASK_INIT) { + if (params->ith != 0) { + return; + } memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); return; } @@ -16589,8 +16793,9 @@ struct ggml_compute_state_shared { const int n_threads; // synchronization primitives - atomic_int n_active; // num active threads - atomic_int node_n; // active graph node + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node + atomic_int node_task; // active graph node task phase bool (*abort_callback)(void * data); // abort ggml_graph_compute when true void * abort_callback_data; @@ -16646,6 +16851,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads + case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads { n_tasks = 1; } break; @@ -16722,7 +16929,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_SOFT_MAX: { - 
n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0])); + n_tasks = MIN(n_threads, ggml_nrows(node->src[0])); } break; case GGML_OP_CONV_TRANSPOSE_1D: { @@ -16837,6 +17044,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { return n_tasks; } +static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) { + // wait for other threads to finish + const int last_node_n = * node_n; + + while (true) { + if (do_yield) { + sched_yield(); + } + + * node_n = atomic_load(&state->shared->node_n); + if (* node_n != last_node_n) break; + } +} + +static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) { + // wait for other threads to finish + const int last_task_phase = * task_phase; + + while (true) { + if (do_yield) { + sched_yield(); + } + + * task_phase = atomic_load(&state->shared->node_task); + if (* task_phase != last_task_phase) break; + } +} + static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; @@ -16847,7 +17082,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { set_numa_thread_affinity(state->ith, n_threads); - int node_n = -1; + int node_n = -1; + int task_phase = GGML_TASK_FINALIZE; while (true) { if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { @@ -16879,7 +17115,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // distribute new work or execute it direct if 1T while (++node_n < cgraph->n_nodes) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); - struct ggml_tensor * node = cgraph->nodes[node_n]; const int n_tasks = ggml_get_n_tasks(node, n_threads); @@ -16888,13 +17123,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { params.nth = n_tasks; - /* INIT */ - if (GGML_OP_HAS_INIT[node->op]) { - params.type = GGML_TASK_INIT; - ggml_compute_forward(¶ms, node); - } - if (n_tasks == 1) { + /* INIT */ + if (GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_INIT; + ggml_compute_forward(¶ms, node); + } + // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // they do something more efficient than spinning (?) params.type = GGML_TASK_COMPUTE; @@ -16915,38 +17150,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } } - atomic_store(&state->shared->n_active, n_threads); - atomic_store(&state->shared->node_n, node_n); + task_phase = GGML_TASK_INIT; + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_n, node_n); + atomic_store(&state->shared->node_task, task_phase); } else { - // wait for other threads to finish - const int last = node_n; - - const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT; - - while (true) { - // TODO: this sched_yield can have significant impact on the performance - either positive or negative - // depending on the workload and the operating system. 
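[Note] The scheduling change in this part of ggml.c splits every graph node into INIT -> COMPUTE -> FINALIZE phases that all worker threads pass through together: the last thread to leave a phase resets n_active and advances the new node_task atomic, while the other threads spin (optionally yielding) in ggml_graph_compute_thread_sync_task. The self-contained sketch below only illustrates that barrier pattern with standard C++ atomics and threads; worker() and run_phase() are invented for the example and the phase names merely mirror the GGML_TASK_* constants, so this is a sketch of the scheme, not the ggml implementation.

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// Illustration of the INIT -> COMPUTE -> FINALIZE hand-off described above.
// std::atomic stands in for ggml's atomic_int wrappers; worker() and
// run_phase() are invented for this sketch and are not ggml APIs.
enum task_phase { TASK_INIT, TASK_COMPUTE, TASK_FINALIZE };

static std::atomic<int> n_active;  // threads that have not yet left the current phase
static std::atomic<int> node_task; // phase currently being executed

static void sync_task(int last_phase) {
    // spin (yielding) until the last thread publishes the next phase
    while (node_task.load() == last_phase) {
        std::this_thread::yield();
    }
}

static void run_phase(int ith, int phase) {
    std::printf("thread %d: phase %d\n", ith, phase);
}

static void worker(int ith, int n_threads) {
    int phase = TASK_INIT;
    for (int next : { TASK_COMPUTE, TASK_FINALIZE }) {
        run_phase(ith, phase);
        // the last thread to finish resets the counter and advances the phase;
        // everyone else waits on the shared atomic (cf. node_task in the diff)
        if (n_active.fetch_sub(1) == 1) {
            n_active.store(n_threads);
            node_task.store(next);
        } else {
            sync_task(phase);
        }
        phase = next;
    }
    run_phase(ith, phase); // FINALIZE
}

int main() {
    const int n_threads = 4;
    n_active.store(n_threads);
    node_task.store(TASK_INIT);

    std::vector<std::thread> pool;
    for (int i = 0; i < n_threads; ++i) {
        pool.emplace_back(worker, i, n_threads);
    }
    for (auto & t : pool) {
        t.join();
    }
    return 0;
}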
- // since it is not clear what is the best approach, it should potentially become user-configurable - // ref: https://github.com/ggerganov/ggml/issues/291 - // UPD: adding the do_yield flag seems to resolve the issue universally - if (do_yield) { - sched_yield(); - } - - node_n = atomic_load(&state->shared->node_n); - if (node_n != last) break; - }; + ggml_graph_compute_thread_sync_node(&node_n, state, false); + ggml_graph_compute_thread_sync_task(&task_phase, state, false); } // check if we should stop if (node_n >= cgraph->n_nodes) break; - /* COMPUTE */ + /* INIT & COMPUTE */ struct ggml_tensor * node = cgraph->nodes[node_n]; const int n_tasks = ggml_get_n_tasks(node, n_threads); struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_COMPUTE, + /*.type =*/ GGML_TASK_INIT, /*.ith =*/ state->ith, /*.nth =*/ n_tasks, /*.wsize =*/ cplan->work_size, @@ -16954,8 +17175,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { }; if (state->ith < n_tasks) { + if (GGML_OP_HAS_INIT[node->op]) { + ggml_compute_forward(¶ms, node); + } + } + + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + task_phase = GGML_TASK_COMPUTE; + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_task, task_phase); + } + else { + // TODO: this sched_yield can have significant impact on the performance - either positive or negative + // depending on the workload and the operating system. + // since it is not clear what is the best approach, it should potentially become user-configurable + // ref: https://github.com/ggerganov/ggml/issues/291 + // UPD: adding the do_yield flag seems to resolve the issue universally + const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT; + ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield); + } + + if (state->ith < n_tasks) { + params.type = GGML_TASK_COMPUTE; ggml_compute_forward(¶ms, node); } + + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + task_phase = GGML_TASK_FINALIZE; + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_task, task_phase); + } + else { + ggml_graph_compute_thread_sync_task(&task_phase, state, false); + } } return GGML_EXIT_SUCCESS; @@ -17012,8 +17264,11 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(node)) { if (node->src[0]->type != GGML_TYPE_F32) { - // here we need memory just for single 2D matrix from src0 - cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]); + // here we need memory for fully dequantized matrix from src0 + // take into account that src0 can be broadcasted into src1[2,3] + cur = ggml_type_size(GGML_TYPE_F32) + * node->src[0]->ne[0]*node->src[0]->ne[1] + * node->src[1]->ne[2]*node->src[1]->ne[3]; } } else #endif @@ -17173,6 +17428,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { /*.n_threads =*/ n_threads, /*.n_active =*/ n_threads, /*.node_n =*/ -1, + /*.node_task =*/ GGML_TASK_FINALIZE, /*.abort_callback =*/ NULL, /*.abort_callback_data =*/ NULL, }; diff --git a/ggml.h b/ggml.h index e2f74412f..16ff51100 100644 --- a/ggml.h +++ b/ggml.h @@ -490,6 +490,8 @@ extern "C" { GGML_UNARY_OP_GELU, GGML_UNARY_OP_GELU_QUICK, GGML_UNARY_OP_SILU, + GGML_UNARY_OP_HARDSWISH, + GGML_UNARY_OP_HARDSIGMOID, GGML_UNARY_OP_COUNT, }; @@ -1033,6 +1035,16 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor 
* b); + // hardswish(x) = x * relu6(x + 3) / 6 + GGML_API struct ggml_tensor * ggml_hardswish( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // hardsigmoid(x) = relu6(x + 3) / 6 + GGML_API struct ggml_tensor * ggml_hardsigmoid( + struct ggml_context * ctx, + struct ggml_tensor * a); + // normalize along rows GGML_API struct ggml_tensor * ggml_norm( struct ggml_context * ctx, @@ -1484,6 +1496,17 @@ extern "C" { int d1, bool is_2D); + GGML_API struct ggml_tensor * ggml_conv_depthwise_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1); + GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2d9c33c7d..f5c933a41 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -101,6 +101,7 @@ class MODEL_ARCH(IntEnum): PHI2 = auto() PLAMO = auto() CODESHELL = auto() + ORION = auto() class MODEL_TENSOR(IntEnum): @@ -151,6 +152,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.PHI2: "phi2", MODEL_ARCH.PLAMO: "plamo", MODEL_ARCH.CODESHELL: "codeshell", + MODEL_ARCH.ORION: "orion", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -427,7 +429,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, - ] + ], + MODEL_ARCH.ORION: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } @@ -452,6 +470,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.ORION: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], } # diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 8682765ed..5b6d4ba6b 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -107,7 +107,7 @@ class GGUFReader: offs, tensors_fields = self._build_tensors_fields(offs, tensor_count) new_align = self.fields.get('general.alignment') if new_align is not None: - if new_align.types != [GGUFValueType.UINT64]: + if new_align.types != [GGUFValueType.UINT32]: raise ValueError('Bad type for general.alignment field') self.alignment = new_align.parts[-1][0] padding = offs % self.alignment diff --git a/llama.cpp b/llama.cpp index 550caced4..4b8d4e2fd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -198,6 +198,7 @@ enum llm_arch { LLM_ARCH_PHI2, LLM_ARCH_PLAMO, LLM_ARCH_CODESHELL, + LLM_ARCH_ORION, LLM_ARCH_UNKNOWN, }; @@ -219,6 +220,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_PHI2, "phi2" }, { LLM_ARCH_PLAMO, "plamo" }, { LLM_ARCH_CODESHELL, "codeshell" }, + { LLM_ARCH_ORION, "orion" }, }; enum llm_kv { @@ -643,6 +645,25 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_ORION, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, 
"blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, @@ -1327,11 +1348,14 @@ static llama_state g_state; // available llama models enum e_model { MODEL_UNKNOWN, + MODEL_0_5B, MODEL_1B, MODEL_3B, + MODEL_4B, MODEL_7B, MODEL_8B, MODEL_13B, + MODEL_14B, MODEL_15B, MODEL_30B, MODEL_34B, @@ -1669,6 +1693,9 @@ struct llama_context { for (ggml_backend_t backend : backends) { ggml_backend_free(backend); } + + ggml_backend_buffer_free(buf_input); + ggml_free(ctx_input); } llama_cparams cparams; @@ -1715,8 +1742,14 @@ struct llama_context { // allocator for the input tensors ggml_tallocr * alloc = nullptr; - // temporary buffer for copying data to/from the backend - std::vector> buf_copy; + // input tensors + ggml_backend_buffer_t buf_input = nullptr; + ggml_context * ctx_input = nullptr; + struct ggml_tensor * inp_tokens; // I32 [n_batch] + struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] + struct ggml_tensor * inp_pos; // I32 [n_batch] + struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch] + struct ggml_tensor * inp_K_shift; // I32 [n_ctx] #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; @@ -2300,18 +2333,18 @@ struct llama_model_loader { } switch (type_max) { - case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; - case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; - case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; - case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; - case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; - case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; - case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break; - case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; - case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; - case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; - case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; - case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; + case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; + case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; + case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; + case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; + case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; + case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; + case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break; + case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; + case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; + case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; + case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; + case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; default: @@ -2661,6 +2694,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; + case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small"; default: return "unknown, may not work"; } @@ -2673,6 +2707,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_7B: return "7B"; case 
MODEL_8B: return "8B"; case MODEL_13B: return "13B"; + case MODEL_14B: return "14B"; case MODEL_15B: return "15B"; case MODEL_30B: return "30B"; case MODEL_34B: return "34B"; @@ -2876,6 +2911,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1B; break; case 32: model.type = e_model::MODEL_3B; break; default: model.type = e_model::MODEL_UNKNOWN; } @@ -2894,9 +2930,9 @@ static void llm_load_hparams( { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { - case 24: model.type = e_model::MODEL_1B; break; + case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break; case 32: model.type = e_model::MODEL_7B; break; - case 40: model.type = e_model::MODEL_13B; break; + case 40: model.type = hparams.n_head == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break; case 80: model.type = e_model::MODEL_70B; break; default: model.type = e_model::MODEL_UNKNOWN; } @@ -2939,7 +2975,15 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_ORION: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + switch (hparams.n_layer) { + case 40: model.type = e_model::MODEL_14B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -3699,6 +3743,11 @@ static bool llm_load_tensors( layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + // optional bias tensors, present in Stable LM 2 1.6B + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); @@ -3917,6 +3966,38 @@ static bool llm_load_tensors( layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; + case LLM_ARCH_ORION: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, 
n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; + + default: throw std::runtime_error("unknown architecture"); } @@ -4082,22 +4163,24 @@ static struct ggml_tensor * llm_build_inp_embd( const llama_hparams & hparams, const llama_batch & batch, struct ggml_tensor * tok_embd, + struct ggml_tensor * inp_tokens, + struct ggml_tensor * inp_embd, const llm_build_cb & cb) { const int64_t n_embd = hparams.n_embd; struct ggml_tensor * inpL; if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); + struct ggml_tensor * inp_tokens_v = ggml_view_1d(ctx, inp_tokens, batch.n_tokens, 0); cb(inp_tokens, "inp_tokens", -1); - inpL = ggml_get_rows(ctx, tok_embd, inp_tokens); + inpL = ggml_get_rows(ctx, tok_embd, inp_tokens_v); } else { #ifdef GGML_USE_MPI GGML_ASSERT(false && "not implemented"); #endif - inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens); + inpL = ggml_view_2d(ctx, inp_embd, n_embd, batch.n_tokens, inp_embd->nb[1], 0); } return inpL; @@ -4111,6 +4194,7 @@ static void llm_build_k_shift( const llama_cparams & cparams, const llama_kv_cache & kv, struct ggml_cgraph * graph, + struct ggml_tensor * K_shift, llm_rope_type type, int64_t n_ctx, float freq_base, @@ -4127,9 +4211,6 @@ static void llm_build_k_shift( const float beta_fast = cparams.yarn_beta_fast; const float beta_slow = cparams.yarn_beta_slow; - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); - int rope_type = 0; switch (type) { @@ -4328,6 +4409,7 @@ static struct ggml_tensor * llm_build_kqv( const llama_model & model, const llama_hparams & hparams, const llama_kv_cache & kv, + struct ggml_cgraph * graph, struct ggml_tensor * wo, struct ggml_tensor * wo_b, struct ggml_tensor * q_cur, @@ -4429,6 +4511,8 @@ static struct ggml_tensor * llm_build_kqv( cb(cur, "kqv_merged_cont", il); #endif + ggml_build_forward_expand(graph, cur); + cur = ggml_mul_mat(ctx, wo, cur); if (wo_b) { cb(cur, "kqv_wo", il); @@ -4441,8 +4525,47 @@ static struct ggml_tensor * llm_build_kqv( return cur; } +static struct ggml_tensor * llm_build_kv( + struct ggml_context * ctx, + const llama_model & model, + const llama_hparams & hparams, + const llama_kv_cache & kv, + struct ggml_cgraph * graph, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + struct ggml_tensor * q_cur, + struct ggml_tensor * kq_mask, + int64_t n_ctx, + int32_t n_tokens, + int32_t kv_head, + int32_t n_kv, + float max_alibi_bias, + float kq_scale, + const llm_build_cb & cb, + int il) { + + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(graph, q_cur); + ggml_build_forward_expand(graph, k_cur); + ggml_build_forward_expand(graph, v_cur); + + llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il); + + struct ggml_tensor * cur; + cur = llm_build_kqv(ctx, model, hparams, kv, graph, + wo, 
wo_b, + q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il); + cb(cur, "kqv_out", il); + + return cur; +} + struct llm_build_context { const llama_model & model; + const llama_context & lctx; const llama_hparams & hparams; const llama_cparams & cparams; const llama_batch & batch; @@ -4489,6 +4612,7 @@ struct llm_build_context { const llm_build_cb & cb, bool worst_case) : model (lctx.model), + lctx (lctx), hparams (model.hparams), cparams (lctx.cparams), batch (batch), @@ -4538,6 +4662,126 @@ struct llm_build_context { ctx0 = nullptr; } } + struct ggml_cgraph * build_orion() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + // if (model.layers[il].bq) { + // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + // cb(Qcur, "Qcur", il); + // } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + // if (model.layers[il].bk) { + // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + // cb(Kcur, "Kcur", il); + // } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + // if (model.layers[il].bv) { + // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + // cb(Vcur, "Vcur", il); + // } + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, 
"ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -4549,20 +4793,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4598,12 +4842,6 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } - // these nodes are added to the graph together so that they are not reordered - // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(gf, Qcur); - ggml_build_forward_expand(gf, Kcur); - ggml_build_forward_expand(gf, Vcur); - Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, @@ -4618,11 +4856,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4741,20 +4977,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be 
broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4799,14 +5035,13 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); // apply ALiBi for 13B model const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f; - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4863,20 +5098,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4928,11 +5163,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4987,15 +5220,15 @@ struct llm_build_context { struct ggml_tensor * pos; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it 
will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); @@ -5029,11 +5262,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5086,19 +5317,19 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5236,12 +5467,9 @@ struct llm_build_context { ); cb(Vcur, "Vcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - // TODO: not tested, could be broken - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5296,11 +5524,11 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); for (int il = 0; il < n_layer; ++il) { @@ -5328,11 +5556,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, 
il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5388,11 +5614,11 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); inpL = llm_build_norm(ctx0, inpL, hparams, @@ -5426,11 +5652,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5483,11 +5707,11 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); for (int il = 0; il < n_layer; ++il) { @@ -5521,11 +5745,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5581,20 +5803,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, 
GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5612,12 +5834,24 @@ struct llm_build_context { // compute Q and K and RoPE them struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, @@ -5633,11 +5867,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5694,20 +5926,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5750,11 +5982,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); 
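[Note] Throughout these graph builders, per-graph ggml_new_tensor_1d/ggml_new_tensor_3d inputs are replaced by ggml_view_1d/ggml_view_2d over the persistent lctx.inp_* tensors, so the same preallocated buffer is reused on every decode and only the n_tokens (or n_kv x n_tokens) slice actually needed is exposed to the graph. The following is a minimal sketch of that pattern under simplified assumptions: sizes are illustrative and the tensors live in a plain CPU ggml_context rather than in lctx.ctx_input/lctx.buf_input.

// Sketch of the "persistent input tensor + per-graph view" pattern used above.
#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,   // keep the data in this context for the sketch
    };
    struct ggml_context * ctx = ggml_init(params);

    // worst-case sizes, allocated once (illustrative values)
    const int n_ctx_max = 512;
    const int n_batch   = 32;

    struct ggml_tensor * inp_pos     = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_batch);
    struct ggml_tensor * inp_KQ_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ctx_max, n_batch);

    // per decode: expose only the slice actually used by this batch
    const int n_tokens = 7;
    const int n_kv     = 128;

    struct ggml_tensor * pos     = ggml_view_1d(ctx, inp_pos, n_tokens, 0);
    struct ggml_tensor * KQ_mask = ggml_view_2d(ctx, inp_KQ_mask,
            n_kv, n_tokens,
            n_kv*ggml_type_size(inp_KQ_mask->type), // packed rows, as in the diff
            0);

    GGML_ASSERT(KQ_mask->ne[0] == n_kv && KQ_mask->ne[1] == n_tokens);
    (void) pos;

    ggml_free(ctx);
    return 0;
}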
cb(cur, "kqv_out", il); } @@ -5810,20 +6040,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5873,11 +6103,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5933,20 +6161,20 @@ struct llm_build_context { struct ggml_tensor * ffn_output; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -6002,11 +6230,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il); cb(cur, "kqv_out", il); } @@ -6057,20 +6283,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - 
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -6107,11 +6333,9 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } struct ggml_tensor * sa_out = cur; @@ -6166,15 +6390,15 @@ struct llm_build_context { struct ggml_tensor * pos; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); @@ -6208,11 +6432,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6266,20 +6488,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = 
ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); cb(inp_pos, "inp_pos", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -6319,11 +6541,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(ctx0, model, hparams, kv_self, + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6375,15 +6595,7 @@ static struct ggml_cgraph * llama_build_graph( // check if we should build the worst-case graph (for memory measurement) const bool worst_case = ggml_tallocr_is_measure(lctx.alloc); - // keep track of the input that has already been allocated - bool alloc_inp_tokens = false; - bool alloc_inp_embd = false; - bool alloc_inp_pos = false; - bool alloc_inp_KQ_mask = false; - bool alloc_inp_K_shift = false; - // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
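[Note] With the inputs preallocated, the hunk below removes the lazy per-tensor allocation from the build callback (which now only pins kqv_merged_cont to the CPU backend when offload_kqv is disabled) and instead writes the batch data once per llama_build_graph call, directly into lctx.inp_*; the KQ_mask and K_shift paths additionally assert that the input buffer is host-visible. The helper below is a hypothetical illustration of that upload pattern, assuming only ggml_backend_tensor_set and ggml_backend_buffer_is_host from ggml-backend.h; llama.cpp writes into the tensors directly rather than through such a wrapper.

#include "ggml.h"
#include "ggml-backend.h"
#include <cstdint>
#include <cstring>

// set_i32_input is a hypothetical helper for this sketch, not a llama.cpp API.
static void set_i32_input(struct ggml_tensor * t, const int32_t * src, int64_t n) {
    const size_t nbytes = n*ggml_element_size(t);
    if (t->buffer == NULL || ggml_backend_buffer_is_host(t->buffer)) {
        memcpy(t->data, src, nbytes);               // host-visible: write in place
    } else {
        ggml_backend_tensor_set(t, src, 0, nbytes); // device buffer: backend copy
    }
}

int main() {
    struct ggml_init_params params = { 1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(params);

    // stand-in for lctx.inp_pos, allocated in plain CPU memory for the sketch
    struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 8);

    const int32_t pos[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    set_i32_input(inp_pos, pos, 8);

    ggml_free(ctx);
    return 0;
}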
- // TODO: improve handling of input and output tensors, then replace this with ggml_set_name llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { if (il >= 0) { ggml_format_name(cur, "%s-%d", name, il); @@ -6391,112 +6603,11 @@ static struct ggml_cgraph * llama_build_graph( ggml_set_name(cur, name); } - // - // allocate input tensors and set input data - // - - if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { - ggml_tallocr_alloc(lctx.alloc, cur); - - if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) { - const int64_t n_tokens = cur->ne[0]; - - ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur)); + if (!lctx.cparams.offload_kqv) { + if (strcmp(name, "kqv_merged_cont") == 0) { + // all nodes between the KV store and the attention output are run on the CPU + ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu); } - - alloc_inp_tokens = true; - } - - if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) { - ggml_tallocr_alloc(lctx.alloc, cur); - - if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) { - const int64_t n_embd = cur->ne[0]; - const int64_t n_tokens = cur->ne[1]; - - ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur)); - } - - alloc_inp_embd = true; - } - - if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { - ggml_tallocr_alloc(lctx.alloc, cur); - - if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) { - const int64_t n_tokens = cur->ne[0]; - - static_assert(std::is_same::value, "llama_pos must be int32_t"); - ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur)); - } - - alloc_inp_pos = true; - } - - if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { - ggml_tallocr_alloc(lctx.alloc, cur); - - if (!ggml_tallocr_is_measure(lctx.alloc)) { - const int64_t n_kv = cur->ne[0]; - const int64_t n_tokens = cur->ne[1]; - - float * data; - if (ggml_backend_buffer_is_host(cur->buffer)) { - data = (float *) cur->data; - } else { - lctx.buf_copy.resize(ggml_nbytes(cur)); - data = (float *) lctx.buf_copy.data(); - } - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - float f; - if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { - f = -INFINITY; - } else { - f = 0; - } - data[h*(n_kv*n_tokens) + j*n_kv + i] = f; - } - } - } - - if (data != cur->data) { - ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur)); - } - } - - alloc_inp_KQ_mask = true; - } - - if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) { - ggml_tallocr_alloc(lctx.alloc, cur); - - if (!ggml_tallocr_is_measure(lctx.alloc)) { - const int64_t n_ctx = cur->ne[0]; - - int32_t * data; - if (ggml_backend_buffer_is_host(cur->buffer)) { - data = (int32_t *) cur->data; - } else { - lctx.buf_copy.resize(ggml_nbytes(cur)); - data = (int32_t *) lctx.buf_copy.data(); - } - - for (int i = 0; i < n_ctx; ++i) { - data[i] = lctx.kv_self.cells[i].delta; - } - - if (data != cur->data) { - ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur)); - } - } - - alloc_inp_K_shift = true; } }; @@ -6504,6 +6615,67 @@ static struct ggml_cgraph * llama_build_graph( struct llm_build_context llm(lctx, batch, cb, worst_case); + // + // set input data + // + + if (!ggml_tallocr_is_measure(lctx.alloc)) { + if (batch.token) { + const int64_t n_tokens = batch.n_tokens; + + ggml_backend_tensor_set(lctx.inp_tokens, 
batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens)); + } + + if (batch.embd) { + const int64_t n_embd = llm.n_embd; + const int64_t n_tokens = batch.n_tokens; + + ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd)); + } + + if (batch.pos) { + const int64_t n_tokens = batch.n_tokens; + + ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos)); + } + + { + const int64_t n_kv = llm.n_kv; + const int64_t n_tokens = batch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); + float * data = (float *) lctx.inp_KQ_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; + + for (int i = 0; i < n_kv; ++i) { + float f; + if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { + f = -INFINITY; + } else { + f = 0; + } + data[h*(n_kv*n_tokens) + j*n_kv + i] = f; + } + } + } + } + + if (llm.do_rope_shift) { + const int64_t n_ctx = llm.n_ctx; + + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); + int32_t * data = (int32_t *) lctx.inp_K_shift->data; + + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } + } + } + llm.init(); switch (model.arch) { @@ -6567,6 +6739,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_codeshell(); } break; + case LLM_ARCH_ORION: + { + result = llm.build_orion(); + } break; default: GGML_ASSERT(false); } @@ -7993,6 +8169,11 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c } void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) { + // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast + // if (k >= (int32_t)candidates->size) { + // return; + // } + const int64_t t_start_sample_us = ggml_time_us(); k = std::max(k, (int) min_keep); @@ -8003,10 +8184,57 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; }; - if (k == (int) candidates->size) { - std::sort(candidates->data, candidates->data + candidates->size, comp); - } else { + if (k <= 128) { std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp); + } else { + constexpr int nbuckets = 128; + constexpr float bucket_low = -10.0f; + constexpr float bucket_high = 10.0f; + constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low); + constexpr float bucker_inter = -bucket_low * bucket_scale; + + std::vector bucket_idx(candidates->size); + std::vector histo(nbuckets, 0); + + for (int i = 0; i < (int)candidates->size; ++i) { + const float val = candidates->data[i].logit; + int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low); + ib = std::max(0, std::min(nbuckets-1, ib)); + bucket_idx[i] = ib; + ++histo[ib]; + } + int nhave = 0; + int ib = nbuckets - 1; + for ( ; ib >= 0; --ib) { + nhave += histo[ib]; + if (nhave >= k) break; + } + std::vector tmp_tokens(nhave); + auto ptr = tmp_tokens.data(); + std::vector bucket_ptrs; + bucket_ptrs.reserve(nbuckets - ib); + for (int j = nbuckets - 1; j >= ib; --j) { + bucket_ptrs.push_back(ptr); + ptr += histo[j]; + } + for (int i = 0; i < (int)candidates->size; ++i) { + int j = 
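
// A minimal, self-contained sketch of the causal KQ mask fill that the new input-setup
// block above writes directly into lctx.inp_KQ_mask->data. The kv_cell struct and
// fill_kq_mask() below are hypothetical stand-ins for the llama.cpp internals
// (llama_kv_cell, llama_batch); only the masking rule is taken from the diff: a KV cell
// is visible to token j iff it belongs to the same sequence and its position is not
// later than the token's position, otherwise the mask entry is -INFINITY.
#include <cmath>
#include <cstdint>
#include <set>
#include <vector>

struct kv_cell {                                   // hypothetical stand-in for llama_kv_cell
    int32_t pos = -1;
    std::set<int32_t> seq_ids;
    bool has_seq_id(int32_t s) const { return seq_ids.count(s) > 0; }
};

// Fills a row-major [n_tokens x n_kv] mask: 0.0f where attention is allowed,
// -INFINITY where it is masked out (different sequence or future position).
static void fill_kq_mask(std::vector<float> & mask,
                         const std::vector<kv_cell> & cells,
                         const std::vector<int32_t> & tok_pos,
                         const std::vector<int32_t> & tok_seq) {
    const size_t n_kv     = cells.size();
    const size_t n_tokens = tok_pos.size();
    mask.assign(n_kv * n_tokens, 0.0f);
    for (size_t j = 0; j < n_tokens; ++j) {
        for (size_t i = 0; i < n_kv; ++i) {
            const bool visible = cells[i].has_seq_id(tok_seq[j]) && cells[i].pos <= tok_pos[j];
            mask[j*n_kv + i] = visible ? 0.0f : -INFINITY;
        }
    }
}
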
bucket_idx[i]; + if (j >= ib) { + *bucket_ptrs[nbuckets-1-j]++ = candidates->data[i]; + } + } + + ptr = tmp_tokens.data(); + int ndone = 0; + for (int j = nbuckets-1; j > ib; --j) { + std::sort(ptr, ptr + histo[j], comp); + ptr += histo[j]; + ndone += histo[j]; + } + std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp); + + std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data)); + } candidates->sorted = true; } @@ -8198,6 +8426,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c } } +void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) { + const int64_t t_start_sample_us = ggml_time_us(); + + // no need to do anything if there is only one (or zero) candidates + if(candidates_p->size <= 1) { + return; + } + + // Calculate maximum possible entropy + float max_entropy = -logf(1.0f / candidates_p->size); + + llama_sample_softmax(nullptr, candidates_p); + + // Calculate entropy of the softmax probabilities + float entropy = 0.0f; + for (size_t i = 0; i < candidates_p->size; ++i) { + float prob = candidates_p->data[i].p; + if (prob > 0.0f) { // Ensure no log(0) + entropy -= prob * logf(prob); + } + } + + // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above) + float normalized_entropy = entropy / max_entropy; + + // Map the normalized entropy to the desired temperature range using the power function + float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val); + +#ifdef DEBUG + LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp); + LLAMA_LOG_INFO("Entropy: %f\n", entropy); + LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy); + LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy); + LLAMA_LOG_INFO("Exponent: %f\n", exponent_val); + LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp); +#endif + + // Apply the dynamically calculated temperature scaling + for (size_t i = 0; i < candidates_p->size; ++i) { + candidates_p->data[i].logit /= dyn_temp; + } + + // Re-compute softmax probabilities after scaling logits with dynamic temperature + double max_l_double = candidates_p->data[0].logit; + double cum_sum_double = 0.0; + for (size_t i = 0; i < candidates_p->size; ++i) { + double p = exp(candidates_p->data[i].logit - max_l_double); + candidates_p->data[i].p = p; // Store the scaled probability + cum_sum_double += p; + } + for (size_t i = 0; i < candidates_p->size; ++i) { + candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities + } + +#ifdef DEBUG + // Print the updated top 25 probabilities after temperature scaling + LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n"); + for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) { + LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f); + } +#endif + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { const int64_t t_start_sample_us = ggml_time_us(); @@ -8786,9 +9081,13 @@ struct quantize_state_internal { const llama_model_quantize_params * params; int n_attention_wv = 0; - int n_feed_forward_w2 = 0; + int n_ffn_down = 0; + int n_ffn_gate = 0; + int n_ffn_up = 0; int i_attention_wv = 0; - int i_feed_forward_w2 = 0; + int i_ffn_down = 0; + int i_ffn_gate 
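
// A standalone version of the bucket-assisted top-k selection used above for large k:
// 128 equal-width buckets over a fixed logit range, a histogram pass, then only the top
// buckets are gathered and sorted. It is slightly simplified (the library code sorts the
// upper buckets one by one and partial-sorts only the boundary bucket), it operates on
// plain floats instead of llama_token_data so it compiles on its own, and the constants
// and the function name are illustrative.
#include <algorithm>
#include <functional>
#include <vector>

static std::vector<float> top_k_bucketed(const std::vector<float> & vals, int k) {
    k = std::min<int>(k, (int) vals.size());
    constexpr int   nbuckets = 128;
    constexpr float lo = -10.0f, hi = 10.0f;
    constexpr float scale = nbuckets / (hi - lo);

    // 1) histogram pass: record which bucket each value falls into
    std::vector<int> bucket_of(vals.size());
    std::vector<int> histo(nbuckets, 0);
    for (size_t i = 0; i < vals.size(); ++i) {
        int b = (int) (scale * (vals[i] - lo));
        b = std::max(0, std::min(nbuckets - 1, b));
        bucket_of[i] = b;
        ++histo[b];
    }

    // 2) walk buckets from the top until at least k values are covered
    int ib = nbuckets - 1, nhave = 0;
    for (; ib >= 0; --ib) { nhave += histo[ib]; if (nhave >= k) break; }

    // 3) gather only the covered values and sort just those, not the whole array
    std::vector<float> tmp;
    tmp.reserve(nhave);
    for (size_t i = 0; i < vals.size(); ++i) {
        if (bucket_of[i] >= ib) tmp.push_back(vals[i]);
    }
    std::partial_sort(tmp.begin(), tmp.begin() + k, tmp.end(), std::greater<float>());
    tmp.resize(k);
    return tmp;   // the k largest values, in descending order
}
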
= 0; + int i_ffn_up = 0; int n_k_quantized = 0; int n_fallback = 0; @@ -8872,6 +9171,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty auto use_more_bits = [](int i_layer, int num_layers) -> bool { return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; }; + const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); + auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { + if (n_expert > 1) { + // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly + // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work + // for getting the current layer as I initially thought, and we need to resort to parsing the + // tensor name. + n_layer /= n_expert; + if (sscanf(name, "blk.%d.", &i_layer) != 1) { + throw std::runtime_error(format("Failed to determine layer for tensor %s", name)); + } + if (i_layer < 0 || i_layer >= n_layer) { + throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer)); + } + } + return std::make_pair(i_layer, n_layer); + }; if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { int nx = tensor->ne[0]; @@ -8891,8 +9207,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty ++qs.i_attention_wv; } else if (name.find("ffn_down") != std::string::npos) { - if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K; - ++qs.i_feed_forward_w2; + if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K; + ++qs.i_ffn_down; } else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K; } else if (name.find("attn_v.weight") != std::string::npos) { @@ -8929,27 +9245,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; } - } else if (name.find("ffn_down") != std::string::npos) { - const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); - int i_layer, n_layer; - if (n_expert == 1) { - i_layer = qs.i_feed_forward_w2; - n_layer = qs.n_feed_forward_w2; - } else { - // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly - // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work - // for getting the current layer as I initially thought, and we need to resort to parsing the - // tensor name. - n_layer = qs.n_feed_forward_w2 / n_expert; - if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) { - throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str())); - } - if (i_layer < 0 || i_layer >= n_layer) { - throw std::runtime_error(format("Bad layer %d for tensor %s. 
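
// The layer_info helper introduced above falls back to parsing the tensor name for MoE
// models (e.g. Mixtral-8x7B) because expert FFN tensors are not stored in layer order,
// so the running i_ffn_down counter cannot simply be divided by n_expert. Below is that
// name-parsing step in isolation, for tensor names of the form "blk.<layer>.<rest>";
// the function name and error messages are illustrative, while the "blk.%d." format
// string is the one used in the diff.
#include <cstdio>
#include <stdexcept>
#include <string>

static int layer_from_tensor_name(const std::string & name, int n_layer) {
    int i_layer = -1;
    if (std::sscanf(name.c_str(), "blk.%d.", &i_layer) != 1) {
        throw std::runtime_error("failed to determine layer for tensor " + name);
    }
    if (i_layer < 0 || i_layer >= n_layer) {
        throw std::runtime_error("bad layer index parsed from tensor " + name);
    }
    return i_layer;
}
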
Must be in [0, %d)", i_layer, name.c_str(), n_layer)); - } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { + new_type = GGML_TYPE_Q2_K; } + } else if (name.find("ffn_down") != std::string::npos) { + auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); + int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { @@ -8979,11 +9282,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; } - ++qs.i_feed_forward_w2; + ++qs.i_ffn_down; } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert == 8) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { new_type = GGML_TYPE_Q5_K; } @@ -9001,6 +9305,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } + else if (name.find("ffn_gate") != std::string::npos) { + auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); + int i_layer = info.first, n_layer = info.second; + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) { + new_type = GGML_TYPE_Q2_K; + } + ++qs.i_ffn_gate; + } + else if (name.find("ffn_up") != std::string::npos) { + auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); + int i_layer = info.first, n_layer = info.second; + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) { + new_type = GGML_TYPE_Q2_K; + } + ++qs.i_ffn_up; + } + // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + //} // IK: let's remove this, else Q2_K is almost the same as Q3_K_S //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) { // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; @@ -9055,8 +9377,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break; // K-quants + case LLAMA_FTYPE_MOSTLY_Q2_K_S: case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; - case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break; + case LLAMA_FTYPE_MOSTLY_Q3_K_XS: case LLAMA_FTYPE_MOSTLY_Q3_K_S: case LLAMA_FTYPE_MOSTLY_Q3_K_M: case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break; @@ -9124,12 +9447,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s ++qs.n_attention_wv; } else if (name.find("ffn_down") != std::string::npos) { - ++qs.n_feed_forward_w2; + ++qs.n_ffn_down; + } + else if (name.find("ffn_gate") != std::string::npos) { + ++qs.n_ffn_gate; + } + else if (name.find("ffn_up") != std::string::npos) 
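
// The new ffn_gate / ffn_up branches above demote Q3_K_XS tensors to Q2_K only when
// use_more_bits() marks the layer as unimportant. For reference, this reproduces that
// heuristic (the lambda defined earlier in get_k_quant_type): keep extra precision in
// the first and last eighth of the layers, plus every third layer in between. The demo
// in main() is illustrative only.
#include <cstdio>

static bool use_more_bits(int i_layer, int n_layer) {
    return i_layer <  n_layer/8 ||
           i_layer >= 7*n_layer/8 ||
           (i_layer - n_layer/8) % 3 == 2;
}

int main() {
    // For a hypothetical 32-layer model, show which layers keep the higher-precision type.
    for (int i = 0; i < 32; ++i) {
        std::printf("layer %2d: %s\n", i, use_more_bits(i, 32) ? "more bits" : "fewer bits");
    }
    return 0;
}
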
{ + ++qs.n_ffn_up; } } - if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) { - LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n", - __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer); + if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) { + LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n", + __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer); } size_t total_size_org = 0; @@ -9936,6 +10265,35 @@ struct llama_context * llama_new_context_with_model( ctx->embedding.resize(hparams.n_embd); } + // graph inputs + { + ggml_init_params init_params = { + /* .mem_size */ ggml_tensor_overhead()*5, + /* .mem_buffer */ nullptr, + /* .no_alloc */ true, + }; + ctx->ctx_input = ggml_init(init_params); + + ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); + ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch); + ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); + ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch); + ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx); + + ggml_set_name(ctx->inp_tokens, "inp_tokens"); + ggml_set_name(ctx->inp_embd, "inp_embd"); + ggml_set_name(ctx->inp_pos, "inp_pos"); + ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask"); + ggml_set_name(ctx->inp_K_shift, "inp_K_shift"); + + ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true)); + + LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__, + ggml_backend_buffer_name(ctx->buf_input), + ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0); + } + + // scheduler and compute buffers { // buffer types used for the compute buffer of each backend std::vector backend_buft; @@ -9962,9 +10320,6 @@ struct llama_context * llama_new_context_with_model( // initialize scheduler with the worst-case graph ggml_backend_sched_init_measure(ctx->sched, gf); - // note: the number of splits during measure is higher than during inference due to the kv shift - int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); - LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits); ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); for (ggml_backend_t backend : ctx->backends) { @@ -9973,6 +10328,10 @@ struct llama_context * llama_new_context_with_model( ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); } + + // note: the number of splits during measure is higher than during inference due to the kv shift + int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); + LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits); } } diff --git a/llama.h b/llama.h index e268d7a1d..7b3634aa6 100644 --- a/llama.h +++ b/llama.h @@ -107,6 +107,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; @@ -774,6 +775,14 @@ extern "C" { float p, size_t min_keep); + /// @details Dynamic temperature implementation 
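
// For readers of the llama_sample_entropy() declaration below: given the softmax
// probabilities of the candidates, the implementation above computes the normalized
// Shannon entropy and maps it into [min_temp, max_temp] as
//   dyn_temp = min_temp + (max_temp - min_temp) * (H / H_max)^exponent,  with H_max = log(N),
// then rescales the logits by 1/dyn_temp. The helper below reproduces only that
// temperature computation; its name and the use of std::vector are illustrative, and the
// early return for a single candidate mirrors the library's "nothing to adapt to" case.
#include <cmath>
#include <vector>

static float dynamic_temperature(const std::vector<float> & probs,
                                 float min_temp, float max_temp, float exponent) {
    if (probs.size() <= 1) {
        return min_temp;                                   // degenerate distribution
    }
    float entropy = 0.0f;
    for (float p : probs) {
        if (p > 0.0f) entropy -= p * std::log(p);          // Shannon entropy, avoiding log(0)
    }
    const float max_entropy = std::log((float) probs.size());   // = -log(1/N)
    const float normalized  = entropy / max_entropy;             // in [0, 1]
    return min_temp + (max_temp - min_temp) * std::pow(normalized, exponent);
}
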
described in the paper https://arxiv.org/abs/2309.02772. + LLAMA_API void llama_sample_entropy( + struct llama_context * ctx, + llama_token_data_array * candidates_p, + float min_temp, + float max_temp, + float exponent_val); + LLAMA_API void llama_sample_temp( struct llama_context * ctx, llama_token_data_array * candidates, diff --git a/mypy.ini b/mypy.ini index 7215a05dd..e51910ca7 100644 --- a/mypy.ini +++ b/mypy.ini @@ -4,3 +4,4 @@ allow_untyped_calls = true allow_untyped_defs = true allow_incomplete_defs = true disable_error_code = import-untyped +warn_return_any = false diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp index e96372c4b..73ffcd1ca 100644 --- a/pocs/vdot/vdot.cpp +++ b/pocs/vdot/vdot.cpp @@ -243,7 +243,6 @@ int main(int argc, char** argv) { if (useQ4_1) q41.resize(n4); else q40.resize(n4); std::vector q8(n8); - std::vector H(16, 0); double sumt = 0, sumt2 = 0, maxt = 0; double sumqt = 0, sumqt2 = 0, maxqt = 0; double sum = 0, sumq = 0, exactSum = 0; diff --git a/scripts/ci-run.sh b/scripts/ci-run.sh new file mode 100755 index 000000000..06b5d9c6e --- /dev/null +++ b/scripts/ci-run.sh @@ -0,0 +1,50 @@ +#!/bin/bash +set -euo pipefail +this=$(realpath "$0"); readonly this +cd "$(dirname "$this")" +shellcheck "$this" + +if (( $# != 1 && $# != 2 )); then + cat >&2 <<'EOF' +usage: + ci-run.sh [] + +This script wraps ci/run.sh: +* If is a ramdisk, you can reduce writes to your SSD. If is not a ramdisk, keep in mind that total writes will increase by the size of . + (openllama_3b_v2: quantized models are about 30GB) +* Persistent model and data files are synced to and from , + excluding generated .gguf files. + (openllama_3b_v2: persistent files are about 6.6GB) +* defaults to ~/.cache/llama.cpp +EOF + exit 1 +fi + +cd .. # => llama.cpp repo root + +tmp="$1" +mkdir -p "$tmp" +tmp=$(realpath "$tmp") +echo >&2 "Using tmp=$tmp" + +cache="${2-$HOME/.cache/llama.cpp}" +mkdir -p "$cache" +cache=$(realpath "$cache") +echo >&2 "Using cache=$cache" + +_sync() { + local from="$1"; shift + local to="$1"; shift + + echo >&2 "Syncing from $from to $to" + mkdir -p "$from" "$to" + rsync -a "$from" "$to" --delete-during "$@" +} + +_sync "$(realpath .)/" "$tmp/llama.cpp" +_sync "$cache/ci-mnt/models/" "$tmp/llama.cpp/ci-mnt/models/" + +cd "$tmp/llama.cpp" +bash ci/run.sh ci-out ci-mnt + +_sync 'ci-mnt/models/' "$cache/ci-mnt/models/" --exclude='*.gguf' -P diff --git a/run_with_preset.py b/scripts/run-with-preset.py similarity index 98% rename from run_with_preset.py rename to scripts/run-with-preset.py index 9b4d7ecbe..a18252730 100755 --- a/run_with_preset.py +++ b/scripts/run-with-preset.py @@ -46,7 +46,7 @@ Formatting considerations: - To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings. - To define a tensor split, pass a list of floats. """ -usage = "run_with_preset.py [-h] [yaml_files ...] [-- ...]" +usage = "run-with-preset.py [-h] [yaml_files ...] [-- ...]" epilog = (" -- specify additional CLI ars to be passed to the binary (override all preset files). 
" "Unknown args will be ignored.") diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 4d52d946b..efde0069f 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -6c1ce0bd591a430c1d3f6797d905194581c878c1 +c2448f88d17395452a587d0176d19ed87e0f7ce1 diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 000000000..092dce742 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1,3 @@ +* +!*.* +test-c.o diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d7aaab843..3e40a78cd 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,6 +1,6 @@ function(llama_build_executable source) get_filename_component(TEST_TARGET ${source} NAME_WE) - add_executable(${TEST_TARGET} ${source}) + add_executable(${TEST_TARGET} ${source} get-model.cpp) install(TARGETS ${TEST_TARGET} RUNTIME) target_link_libraries(${TEST_TARGET} PRIVATE common) endfunction() @@ -8,14 +8,20 @@ endfunction() function(llama_test_executable name source) get_filename_component(TEST_TARGET ${source} NAME_WE) add_test(NAME ${name} COMMAND $ ${ARGN}) + set_property(TEST ${name} PROPERTY LABELS "main") endfunction() function(llama_build_and_test_executable source) + llama_build_and_test_executable_with_label(${source} "main") +endfunction() + +function(llama_build_and_test_executable_with_label source label) get_filename_component(TEST_TARGET ${source} NAME_WE) - add_executable(${TEST_TARGET} ${source}) + add_executable(${TEST_TARGET} ${source} get-model.cpp) install(TARGETS ${TEST_TARGET} RUNTIME) target_link_libraries(${TEST_TARGET} PRIVATE common) add_test(NAME ${TEST_TARGET} COMMAND $ ${ARGN}) + set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${label}) endfunction() # llama_build_and_test_executable(test-double-float.cpp) # SLOW @@ -49,10 +55,12 @@ llama_build_and_test_executable(test-llama-grammar.cpp) llama_build_and_test_executable(test-grad0.cpp) # llama_build_and_test_executable(test-opt.cpp) # SLOW llama_build_and_test_executable(test-backend-ops.cpp) -llama_build_and_test_executable(test-autorelease.cpp) llama_build_and_test_executable(test-rope.cpp) +llama_build_and_test_executable_with_label(test-model-load-cancel.cpp "model") +llama_build_and_test_executable_with_label(test-autorelease.cpp "model") + # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) add_executable(${TEST_TARGET} test-c.c) diff --git a/tests/get-model.cpp b/tests/get-model.cpp new file mode 100644 index 000000000..4edb685f0 --- /dev/null +++ b/tests/get-model.cpp @@ -0,0 +1,21 @@ +#include +#include +#include + +#include "get-model.h" + +char * get_model_or_exit(int argc, char *argv[]) { + char * model_path; + if (argc > 1) { + model_path = argv[1]; + + } else { + model_path = getenv("LLAMACPP_TEST_MODELFILE"); + if (!model_path || strlen(model_path) == 0) { + fprintf(stderr, "\033[33mWARNING: No model file provided. Skipping this test. 
Set LLAMACPP_TEST_MODELFILE= to silence this warning and run this test.\n\033[0m"); + exit(EXIT_SUCCESS); + } + } + + return model_path; +} diff --git a/tests/get-model.h b/tests/get-model.h new file mode 100644 index 000000000..81a3a0fef --- /dev/null +++ b/tests/get-model.h @@ -0,0 +1,2 @@ +#pragma once +char * get_model_or_exit(int, char*[]); diff --git a/tests/test-autorelease.cpp b/tests/test-autorelease.cpp index 289c6ba6c..36a23c0bb 100644 --- a/tests/test-autorelease.cpp +++ b/tests/test-autorelease.cpp @@ -5,19 +5,15 @@ #include #include "llama.h" +#include "get-model.h" // This creates a new context inside a pthread and then tries to exit cleanly. int main(int argc, char ** argv) { - if (argc < 2) { - printf("Usage: %s model.gguf\n", argv[0]); - return 0; // intentionally return success - } + auto * model_path = get_model_or_exit(argc, argv); - const std::string fname = argv[1]; - - std::thread([&fname]() { + std::thread([&model_path]() { llama_backend_init(false); - auto * model = llama_load_model_from_file(fname.c_str(), llama_model_default_params()); + auto * model = llama_load_model_from_file(model_path, llama_model_default_params()); auto * ctx = llama_new_context_with_model(model, llama_context_default_params()); llama_free(ctx); llama_free_model(model); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index db1244876..4c98bef7c 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -102,7 +102,6 @@ static std::vector tensor_to_float(const ggml_tensor * t) { } else if (t->type == GGML_TYPE_I8) { tv.push_back((float)*(int8_t *) &buf[i]); } else if (quantized) { - std::vector vq(ggml_blck_size(t->type)); tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type)); tv.insert(tv.end(), vq.begin(), vq.end()); } else { diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp index 73dd33dd2..78fc41117 100644 --- a/tests/test-llama-grammar.cpp +++ b/tests/test-llama-grammar.cpp @@ -190,7 +190,6 @@ int main() index++; } - std::vector> next_stacks; std::vector next_candidates; next_candidates.resize(24); diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp new file mode 100644 index 000000000..7ea4bbacc --- /dev/null +++ b/tests/test-model-load-cancel.cpp @@ -0,0 +1,27 @@ +#include "llama.h" +#include "get-model.h" + +#include + +int main(int argc, char *argv[] ) { + auto * model_path = get_model_or_exit(argc, argv); + auto * file = fopen(model_path, "r"); + if (file == nullptr) { + fprintf(stderr, "no model at '%s' found\n", model_path); + return EXIT_FAILURE; + } + + fprintf(stderr, "using '%s'\n", model_path); + fclose(file); + + llama_backend_init(false); + auto params = llama_model_params{}; + params.use_mmap = false; + params.progress_callback = [](float progress, void * ctx){ + (void) ctx; + return progress > 0.50; + }; + auto * model = llama_load_model_from_file(model_path, params); + llama_backend_free(); + return model == nullptr ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 32e58941c..c3b3d6629 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -5,11 +5,10 @@ #undef NDEBUG #endif -#include -#include -#include -#include #include +#include +#include +#include static void dump(const llama_token_data_array * candidates) { for (size_t i = 0; i < candidates->size; i++) { @@ -20,11 +19,11 @@ static void dump(const llama_token_data_array * candidates) { #define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0) static void test_top_k(const std::vector & probs, const std::vector & expected_probs, int k) { - size_t n_vocab = probs.size(); + const size_t n_vocab = probs.size(); std::vector candidates; candidates.reserve(n_vocab); for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - float logit = log(probs[token_id]); + const float logit = logf(probs[token_id]); candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); } @@ -41,11 +40,11 @@ static void test_top_k(const std::vector & probs, const std::vector & probs, const std::vector & expected_probs, float p) { - size_t n_vocab = probs.size(); + const size_t n_vocab = probs.size(); std::vector candidates; candidates.reserve(n_vocab); for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - float logit = log(probs[token_id]); + const float logit = logf(probs[token_id]); candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); } @@ -62,11 +61,11 @@ static void test_top_p(const std::vector & probs, const std::vector & probs, const std::vector & expected_probs, float z) { - size_t n_vocab = probs.size(); + const size_t n_vocab = probs.size(); std::vector candidates; candidates.reserve(n_vocab); for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - float logit = log(probs[token_id]); + const float logit = logf(probs[token_id]); candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); } @@ -81,12 +80,33 @@ static void test_tfs(const std::vector & probs, const std::vector } } -static void test_typical(const std::vector & probs, const std::vector & expected_probs, float p) { - size_t n_vocab = probs.size(); +static void test_min_p(const std::vector & probs, const std::vector & expected_probs, float p) { + const size_t n_vocab = probs.size(); std::vector candidates; candidates.reserve(n_vocab); for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - float logit = log(probs[token_id]); + const float logit = logf(probs[token_id]); + candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + DUMP(&candidates_p); + llama_sample_min_p(nullptr, &candidates_p, p, 1); + DUMP(&candidates_p); + llama_sample_softmax(nullptr, &candidates_p); + + GGML_ASSERT(candidates_p.size == expected_probs.size()); + for (size_t i = 0; i < candidates_p.size; i++) { + GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + } +} + +static void test_typical(const std::vector & probs, const std::vector & expected_probs, float p) { + const size_t n_vocab = probs.size(); + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { + const float logit = logf(probs[token_id]); candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); } @@ -107,11 
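
// A hedged usage example of the load-cancellation path that test-model-load-cancel.cpp
// above exercises: judging by that test, a progress_callback that returns false appears
// to abort loading, and llama_load_model_from_file() then returns nullptr. Only API
// calls already present in the tests are used; the 25% threshold and the model path are
// made up for illustration.
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init(false);

    llama_model_params params = llama_model_default_params();
    params.use_mmap = false;
    // Keep loading while reported progress is below ~25%, then cancel by returning false.
    params.progress_callback = [](float progress, void * /*user_data*/) {
        return progress <= 0.25f;
    };

    auto * model = llama_load_model_from_file("models/example.gguf", params);  // hypothetical path
    if (model == nullptr) {
        std::fprintf(stderr, "model load was cancelled by the progress callback\n");
    } else {
        llama_free_model(model);
    }

    llama_backend_free();
    return 0;
}
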
+127,11 @@ static void test_repetition_penalties( ) { GGML_ASSERT(probs.size() == expected_probs.size()); - size_t n_vocab = probs.size(); + const size_t n_vocab = probs.size(); std::vector candidates; candidates.reserve(n_vocab); for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - float logit = log(probs[token_id]); + const float logit = logf(probs[token_id]); candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); } @@ -128,6 +148,88 @@ static void test_repetition_penalties( } } +static void test_sampler_queue( + const size_t n_vocab, const std::string samplers_sequence, const int top_k, const float top_p, const float min_p +) { + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { + const float logit = logf(token_id); + candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + llama_token min_token_id = 0; + const llama_token max_token_id = n_vocab-1; + + for (auto s : samplers_sequence) { + switch (s){ + case 'k': llama_sample_top_k (nullptr, &candidates_p, top_k, 1); break; + case 'f': GGML_ASSERT(false && "tail_free test not implemented"); break; + case 'y': GGML_ASSERT(false && "typical test not implemented"); break; + case 'p': llama_sample_top_p (nullptr, &candidates_p, top_p, 1); break; + case 'm': llama_sample_min_p (nullptr, &candidates_p, min_p, 1); break; + case 't': GGML_ASSERT(false && "temperature test not implemented"); break; + default : GGML_ASSERT(false && "Unknown sampler"); break; + } + + llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests + + const int size = candidates_p.size; + + if (s == 'k') { + const int expected_size = std::min(size, top_k); + min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k)); + + GGML_ASSERT(size == expected_size); + GGML_ASSERT(candidates_p.data[0].id == max_token_id); + GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id); + } else if (s == 'p') { + const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2; + const int softmax_numerator_target = ceilf(top_p * softmax_divisor); + + min_token_id = n_vocab; + int expected_size = 0; + int cumsum = 0; + do { // do-while because always at least one token is sampled + min_token_id--; + expected_size++; + + cumsum += min_token_id; + } while (cumsum < softmax_numerator_target); + + // token 0 has p == 0, need special consideration for cumsum because top_p immediately returns + if (min_token_id == 1) { + min_token_id--; + expected_size += 1; + } + + GGML_ASSERT(size == expected_size); + GGML_ASSERT(candidates_p.data[0].id == max_token_id); + GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id); + } else if (s == 'm') { + int expected_size = ceilf((1.0f-min_p) * n_vocab); + expected_size = std::max(expected_size, 1); + expected_size = std::min(expected_size, size); + + min_token_id = floorf(min_p * n_vocab); + min_token_id = std::max(min_token_id, 1); + min_token_id = std::max(min_token_id, (llama_token)(n_vocab - size)); + min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1)); + + GGML_ASSERT(size == expected_size); + GGML_ASSERT(candidates_p.data[0].id == max_token_id); + GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id); + } else { + GGML_ASSERT(false); + } + } + + printf("Sampler queue %3s OK with n_vocab=%05ld top_k=%05d top_p=%f 
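
// The expected-size arithmetic inside test_sampler_queue() above is terse, so here it is
// as a standalone helper. Because the test assigns logit(i) = log(i), the softmax
// probability of token i over the surviving id range [min_id, n_vocab) is
// i / (sum of surviving ids), and that sum is n(n-1)/2 - m(m-1)/2 (the test's
// softmax_divisor). Top-p therefore keeps the largest ids until their id-sum reaches
// ceil(top_p * divisor). This sketch ignores the test's special case for the
// zero-probability token 0; the function name is illustrative.
#include <cmath>

static int expected_top_p_size(int n_vocab, int min_id, float top_p) {
    const long long divisor = (long long) n_vocab*(n_vocab - 1)/2 - (long long) min_id*(min_id - 1)/2;
    const long long target  = (long long) std::ceil((double) top_p * (double) divisor);

    int kept = 0;
    long long cumsum = 0;
    for (int id = n_vocab - 1; id >= min_id; --id) {   // walk ids from most to least probable
        cumsum += id;
        ++kept;
        if (cumsum >= target) break;                   // at least one token is always kept
    }
    return kept;
}
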
min_p=%f\n", + samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p); +} + int main(void) { ggml_time_init(); @@ -139,6 +241,15 @@ int main(void) { test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1); + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f); + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f); + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.26f); + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.49f); + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.51f); + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.74f); + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f); + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f); + test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f); @@ -154,6 +265,34 @@ int main(void) { test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f); test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f); + test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f); + test_sampler_queue(10000, "k", 1, 1.0f, 1.0f); + test_sampler_queue(10000, "p", 10000, 1.0f, 1.0f); + test_sampler_queue(10000, "p", 10000, 0.0f, 1.0f); + test_sampler_queue(10000, "m", 10000, 1.0f, 1.0f); + test_sampler_queue(10000, "m", 10000, 1.0f, 1e-12); + + test_sampler_queue(10000, "k", 100, 1.0000f, 1.0f); + test_sampler_queue(10000, "p", 10000, 0.0002f, 1.0f); + test_sampler_queue(10000, "p", 10000, 0.8000f, 1.0f); + test_sampler_queue(10000, "m", 10000, 1.0000f, 9997.9f/9999.0f); + test_sampler_queue(10000, "m", 10000, 1.0000f, 0.1f); + + test_sampler_queue(10000, "kp", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "km", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "pk", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "pm", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "mk", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "mp", 100, 0.8f, 9997.9f/9999.0f); + test_sampler_queue(10000, "mp", 100, 0.8f, 0.1f); + + test_sampler_queue(10000, "kpm", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "kmp", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "pkm", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "pmk", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "mkp", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "mpk", 100, 0.8f, 0.1f); + printf("OK\n"); return 0; diff --git a/unicode.h b/unicode.h index aeca879ea..844eff3da 100644 --- a/unicode.h +++ b/unicode.h @@ -2,8 +2,9 @@ #include #include -#include +#include #include +#include static const std::vector> digit_ranges = { {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},