Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-13 13:52:22 +01:00)

Commit 96a299ff60: Merge branch 'master' into compilade/lazier-moe-convert-hf
@@ -214,7 +214,6 @@ effectiveStdenv.mkDerivation (
       (cmakeBool "LLAMA_CUDA" useCuda)
       (cmakeBool "LLAMA_HIPBLAS" useRocm)
       (cmakeBool "LLAMA_METAL" useMetalKit)
-      (cmakeBool "LLAMA_MPI" useMpi)
       (cmakeBool "LLAMA_VULKAN" useVulkan)
       (cmakeBool "LLAMA_STATIC" enableStatic)
     ]
@@ -227,20 +226,20 @@ effectiveStdenv.mkDerivation (
         )
       ]
       ++ optionals useRocm [
-        (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
-        (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
-
-        # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
-        # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
-        # and select the line that matches the current nixpkgs version of rocBLAS.
-        # Should likely use `rocmPackages.clr.gpuTargets`.
-        "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
+        (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
+        (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
       ]
       ++ optionals useMetalKit [
         (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
         (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
       ];

+    # Environment variables needed for ROCm
+    env = optionals useRocm {
+      ROCM_PATH = "${rocmPackages.clr}";
+      HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
+    };
+
     # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
     # if they haven't been added yet.
     postInstall = ''
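For orientation, `cmakeBool` and `cmakeFeature` are nixpkgs helpers that expand to plain `-D` cache entries, so the ROCm branch above is roughly equivalent to a configure call like the sketch below. The compiler path and GPU target list shown here are placeholders; the real values come from `rocmPackages`.

```bash
# Illustration only: approximate CMake flags produced by the Nix expression for a ROCm build
cmake -S . -B build \
  -DLLAMA_HIPBLAS=ON \
  -DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang \
  -DCMAKE_HIP_ARCHITECTURES="gfx906;gfx1030;gfx1100"   # joined from rocmPackages.clr.gpuTargets
```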
.github/labeler.yml (new file, vendored, 90 lines)
@@ -0,0 +1,90 @@
+# https://github.com/actions/labeler
+Kompute:
+  - changed-files:
+    - any-glob-to-any-file:
+      - ggml-kompute.h
+      - ggml-kompute.cpp
+      - README-kompute.md
+Apple Metal:
+  - changed-files:
+    - any-glob-to-any-file:
+      - ggml-metal.h
+      - ggml-metal.cpp
+      - README-metal.md
+SYCL:
+  - changed-files:
+    - any-glob-to-any-file:
+      - ggml-sycl.h
+      - ggml-sycl.cpp
+      - README-sycl.md
+Nvidia GPU:
+  - changed-files:
+    - any-glob-to-any-file:
+      - ggml-cuda.h
+      - ggml-cuda/**
+Vulkan:
+  - changed-files:
+    - any-glob-to-any-file:
+      - ggml_vk_generate_shaders.py
+      - ggml-vulkan*
+documentation:
+  - changed-files:
+    - any-glob-to-any-file:
+      - docs/**
+      - media/**
+testing:
+  - changed-files:
+    - any-glob-to-any-file:
+      - tests/**
+build:
+  - changed-files:
+    - any-glob-to-any-file:
+      - cmake/**
+      - CMakeLists.txt
+      - CMakePresets.json
+      - codecov.yml
+examples:
+  - changed-files:
+    - any-glob-to-any-file: examples/**
+devops:
+  - changed-files:
+    - any-glob-to-any-file:
+      - .devops/**
+      - .github/**
+      - ci/**
+python:
+  - changed-files:
+    - any-glob-to-any-file:
+      - "**/*.py"
+      - requirements/**
+      - gguf-py/**
+      - .flake8
+script:
+  - changed-files:
+    - any-glob-to-any-file:
+      - scripts/**
+android:
+  - changed-files:
+    - any-glob-to-any-file:
+      - examples/llama.android/**
+server:
+  - changed-files:
+    - any-glob-to-any-file:
+      - examples/server/**
+ggml:
+  - changed-files:
+    - any-glob-to-any-file:
+      - ggml.c
+      - ggml.h
+      - ggml-*.c
+      - ggml-*.h
+      - ggml-cuda/**
+nix:
+  - changed-files:
+    - any-glob-to-any-file:
+      - "**/*.nix"
+      - .github/workflows/nix-*.yml
+      - .devops/nix/nixpkgs-instances.nix
+embedding:
+  - changed-files:
+    - any-glob-to-any-file: examples/embedding/
.github/workflows/build.yml (vendored, 208 changed lines)

@@ -271,49 +271,15 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
          name: llama-bin-ubuntu-x64.zip

-  # ubuntu-latest-cmake-sanitizer:
-  #   runs-on: ubuntu-latest
-  #
-  #   continue-on-error: true
-  #
-  #   strategy:
-  #     matrix:
-  #       sanitizer: [ADDRESS, THREAD, UNDEFINED]
-  #       build_type: [Debug, Release]
-  #
-  #   steps:
-  #     - name: Clone
-  #       id: checkout
-  #       uses: actions/checkout@v4
-  #
-  #     - name: Dependencies
-  #       id: depends
-  #       run: |
-  #         sudo apt-get update
-  #         sudo apt-get install build-essential
-  #
-  #     - name: Build
-  #       id: cmake_build
-  #       run: |
-  #         mkdir build
-  #         cd build
-  #         cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-  #         cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-  #
-  #     - name: Test
-  #       id: cmake_test
-  #       run: |
-  #         cd build
-  #         ctest -L main --verbose --timeout 900
-
-  ubuntu-latest-cmake-mpi:
+  ubuntu-latest-cmake-sanitizer:
    runs-on: ubuntu-latest

    continue-on-error: true

    strategy:
      matrix:
-        mpi_library: [mpich, libopenmpi-dev]
+        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        build_type: [Debug, Release]

    steps:
      - name: Clone

@@ -324,14 +290,44 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ${{ matrix.mpi_library }}
+          sudo apt-get install build-essential

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
-          cmake -DLLAMA_MPI=ON ..
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+  ubuntu-latest-cmake-rpc:
+    runs-on: ubuntu-latest
+
+    continue-on-error: true
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake -DLLAMA_RPC=ON ..
          cmake --build . --config Release -j $(nproc)

      - name: Test

@@ -362,6 +358,33 @@ jobs:
          cmake -DLLAMA_VULKAN=ON ..
          cmake --build . --config Release -j $(nproc)

+  ubuntu-22-cmake-hip:
+    runs-on: ubuntu-22.04
+    container: rocm/dev-ubuntu-22.04:6.0.2
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
+
+      - name: Build with native CMake HIP support
+        id: cmake_build
+        run: |
+          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Build with legacy HIP support
+        id: cmake_build_legacy_hip
+        run: |
+          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
+          cmake --build build2 --config Release -j $(nproc)
+
  ubuntu-22-cmake-sycl:
    runs-on: ubuntu-22.04

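The new `ubuntu-22-cmake-hip` job exercises both the native CMake HIP path and the legacy hipcc path. A rough local equivalent of the two configurations, assuming a ROCm installation that provides `hipconfig` and `hipcc` on PATH (as the `rocm/dev-ubuntu-22.04` container does), would look like this sketch:

```bash
# native CMake HIP support: CMake drives the ROCm clang directly
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
cmake --build build --config Release -j "$(nproc)"

# legacy path: hipcc as the C/C++ compiler (still accepted, but reported as legacy behavior)
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
cmake --build build2 --config Release -j "$(nproc)"
```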
@@ -663,24 +686,28 @@ jobs:
    strategy:
      matrix:
        include:
-          - build: 'noavx'
+          - build: 'rpc-x64'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'noavx-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx2'
+          - build: 'avx2-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx'
+          - build: 'avx-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx512'
+          - build: 'avx512-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'clblast'
+          - build: 'clblast-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
-          - build: 'openblas'
+          - build: 'openblas-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
-          - build: 'kompute'
+          - build: 'kompute-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'vulkan'
+          - build: 'vulkan-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'arm64'
-            defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'llvm-arm64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'msvc-arm64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'

    steps:
      - name: Clone

@@ -691,13 +718,13 @@ jobs:

      - name: Clone Kompute submodule
        id: clone_kompute
-        if: ${{ matrix.build == 'kompute' }}
+        if: ${{ matrix.build == 'kompute-x64' }}
        run: |
          git submodule update --init kompute

      - name: Download OpenCL SDK
        id: get_opencl
-        if: ${{ matrix.build == 'clblast' }}
+        if: ${{ matrix.build == 'clblast-x64' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
          mkdir $env:RUNNER_TEMP/opencl

@@ -705,7 +732,7 @@ jobs:

      - name: Download CLBlast
        id: get_clblast
-        if: ${{ matrix.build == 'clblast' }}
+        if: ${{ matrix.build == 'clblast-x64' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
          curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"

@@ -718,7 +745,7 @@ jobs:

      - name: Download OpenBLAS
        id: get_openblas
-        if: ${{ matrix.build == 'openblas' }}
+        if: ${{ matrix.build == 'openblas-x64' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"

@@ -731,38 +758,41 @@ jobs:

      - name: Install Vulkan SDK
        id: get_vulkan
-        if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
+        if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
      - name: Build
        id: cmake_build
        run: |
-          mkdir build
-          cd build
-          cmake .. ${{ matrix.defines }}
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cmake -S . -B build ${{ matrix.defines }}
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}

      - name: Add clblast.dll
        id: add_clblast_dll
-        if: ${{ matrix.build == 'clblast' }}
+        if: ${{ matrix.build == 'clblast-x64' }}
        run: |
          cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
          cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt

      - name: Add libopenblas.dll
        id: add_libopenblas_dll
-        if: ${{ matrix.build == 'openblas' }}
+        if: ${{ matrix.build == 'openblas-x64' }}
        run: |
          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt

      - name: Check AVX512F support
        id: check_avx512f
-        if: ${{ matrix.build == 'avx512' }}
+        if: ${{ matrix.build == 'avx512-x64' }}
        continue-on-error: true
        run: |
          cd build
@@ -776,14 +806,14 @@ jobs:
      - name: Test
        id: cmake_test
        # not all machines have native AVX-512
-        if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
+        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
        run: |
          cd build
          ctest -L main -C Release --verbose --timeout 900

      - name: Test (Intel SDE)
        id: cmake_test_sde
-        if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
+        if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
        run: |
          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
          # for some weird reason windows tar doesn't like sde tar.xz

@@ -811,14 +841,14 @@ jobs:
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
-          name: llama-bin-win-${{ matrix.build }}-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
+          name: llama-bin-win-${{ matrix.build }}.zip

  windows-latest-cmake-cuda:
    runs-on: windows-latest

@@ -898,9 +928,9 @@ jobs:
    shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
    steps:
      - name: Clone
        id: checkout

@@ -932,6 +962,17 @@ jobs:
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
+          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+          echo "cp oneAPI running time dll files to ./build/bin done"
          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*

      - name: Upload artifacts

@@ -941,6 +982,37 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
          name: llama-bin-win-sycl-x64.zip

+  windows-latest-cmake-hip:
+    runs-on: windows-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Install
+        id: depends
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+          write-host "Completed AMD HIP SDK installation"
+
+      - name: Verify ROCm
+        id: verify
+        run: |
+          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
+
+      - name: Build
+        id: cmake_build
+        run: |
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
+          cmake --build build --config Release
+
  ios-xcode-build:
    runs-on: macos-latest

.github/workflows/docker.yml (vendored, 5 changed lines)

@@ -42,8 +42,9 @@ jobs:
          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
+          # TODO: Disabled due to build issues https://github.com/ggerganov/llama.cpp/issues/7507
+          #- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
+          #- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
.github/workflows/labeler.yml (new file, vendored, 17 lines)
@@ -0,0 +1,17 @@
+name: "Pull Request Labeler"
+on:
+- pull_request_target
+
+jobs:
+  labeler:
+    permissions:
+      contents: read
+      pull-requests: write
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        repository: "ggerganov/llama.cpp"
+    - uses: actions/labeler@v5
+      with:
+        configuration-path: '.github/labeler.yml'
.github/workflows/server.yml (vendored, 8 changed lines)

@@ -32,10 +32,8 @@ jobs:

    strategy:
      matrix:
-        # TODO: temporary disabled due to linux kernel issues
-        #sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        sanitizer: [UNDEFINED]
-        build_type: [Debug]
+        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        build_type: [RelWithDebInfo]
        include:
          - build_type: Release
            sanitizer: ""

@@ -102,10 +100,8 @@ jobs:
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
-

      - name: Tests
        id: server_integration_tests
-        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh
.github/workflows/zig-build.yml (deleted file, vendored, 29 lines)
@@ -1,29 +0,0 @@
-name: Zig CI
-
-on:
-  pull_request:
-  push:
-    branches:
-      - master
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  build:
-    strategy:
-      fail-fast: false
-      matrix:
-        runs-on: [ubuntu-latest, macos-latest, windows-latest]
-    runs-on: ${{ matrix.runs-on }}
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-          fetch-depth: 0
-      - uses: goto-bus-stop/setup-zig@v2
-        with:
-          version: 0.11.0
-      - name: Build Summary
-        run: zig build --summary all -freference-trace
CMakeLists.txt (123 changed lines)

@@ -1,4 +1,4 @@
 cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
 project("llama.cpp" C CXX)
 include(CheckIncludeFileCXX)

@@ -72,11 +72,13 @@ else()
     set(INS_ENB ON)
 endif()

+option(LLAMA_SVE "llama: enable SVE" OFF)
 option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
 option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
 option(LLAMA_AVX512 "llama: enable AVX512" OFF)
 option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
 option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
+option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF)
 option(LLAMA_FMA "llama: enable FMA" ${INS_ENB})
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)

@@ -122,8 +124,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
     "llama: metal minimum macOS version")
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
-option(LLAMA_MPI "llama: use MPI" OFF)
-option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
+option(LLAMA_RPC "llama: use RPC" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
 set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")

@@ -133,6 +134,8 @@ set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeli
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ON)
+option(LLAMA_LASX "llama: enable lasx" ON)
+option(LLAMA_LSX "llama: enable lsx" ON)

 # add perf arguments
 option(LLAMA_PERF "llama: enable perf" OFF)

@@ -381,10 +384,6 @@ if (LLAMA_LLAMAFILE)
     set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
 endif()

-if (LLAMA_QKK_64)
-    add_compile_definitions(GGML_QKK_64)
-endif()
-
 if (LLAMA_CUBLAS)
     message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
     set(LLAMA_CUDA ON)

@@ -465,33 +464,15 @@ if (LLAMA_CUDA)
     endif()
 endif()

-if (LLAMA_MPI)
-    cmake_minimum_required(VERSION 3.10)
-    find_package(MPI)
-    if (MPI_C_FOUND)
-        message(STATUS "MPI found")
-
-        set(GGML_HEADERS_MPI ggml-mpi.h)
-        set(GGML_SOURCES_MPI ggml-mpi.c)
-
-        add_compile_definitions(GGML_USE_MPI)
-        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
-
-        if (NOT MSVC)
-            add_compile_options(-Wno-cast-qual)
-        endif()
-
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
-        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
-
-        # Even if you're only using the C header, C++ programs may bring in MPI
-        # C++ functions, so more linkage is needed
-        if (MPI_CXX_FOUND)
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
-        endif()
-    else()
-        message(WARNING "MPI not found")
-    endif()
+if (LLAMA_RPC)
+    add_compile_definitions(GGML_USE_RPC)
+
+    if (WIN32)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
+    endif()
+
+    set(GGML_HEADERS_RPC ggml-rpc.h)
+    set(GGML_SOURCES_RPC ggml-rpc.cpp)
 endif()

 if (LLAMA_CLBLAST)
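The removed MPI backend is replaced by the new RPC backend option. As a sketch of how the new flag is used at configure time (mirroring the `ubuntu-latest-cmake-rpc` CI job added above; runtime usage of the RPC backend is outside this diff):

```bash
# configure llama.cpp with the new RPC backend enabled
cmake -B build -S . -DLLAMA_RPC=ON
cmake --build build --config Release -j "$(nproc)"
```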
@@ -520,6 +501,12 @@ if (LLAMA_VULKAN)

     add_compile_definitions(GGML_USE_VULKAN)

+    # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
+    # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
+    if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
+    endif()
+
     if (LLAMA_VULKAN_CHECK_RESULTS)
         add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
     endif()

@@ -543,16 +530,37 @@ if (LLAMA_VULKAN)
     endif()

 if (LLAMA_HIPBLAS)
-    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
-
-    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
-    endif()
-
-    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
-    endif()
+    if ($ENV{ROCM_PATH})
+        set(ROCM_PATH $ENV{ROCM_PATH})
+    else()
+        set(ROCM_PATH /opt/rocm)
+    endif()
+    list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
+
+    # CMake on Windows doesn't support the HIP language yet
+    if(WIN32)
+        set(CXX_IS_HIPCC TRUE)
+    else()
+        string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
+    endif()
+
+    if(CXX_IS_HIPCC)
+        if(LINUX)
+            if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+                message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
+            endif()
+
+            message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
+                    " Prefer setting the HIP compiler directly. See README for details.")
+        endif()
+    else()
+        # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
+        if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+            set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
+        endif()
+        cmake_minimum_required(VERSION 3.21)
+        enable_language(HIP)
+    endif()
     find_package(hip REQUIRED)
     find_package(hipblas REQUIRED)
     find_package(rocblas REQUIRED)

@@ -586,13 +594,18 @@ if (LLAMA_HIPBLAS)
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

-    set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
+    if (CXX_IS_HIPCC)
+        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device)
+    else()
+        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
+    endif()

     if (LLAMA_STATIC)
         message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
     endif()

-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
 endif()

 if (LLAMA_SYCL)
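With this split, a `-DLLAMA_HIPBLAS=ON` configure uses CMake's native HIP language unless the C++ compiler matches hipcc. A hedged sketch of the two invocations, assuming a standard /opt/rocm installation and using gfx1030 only as an example target (it is taken from the README example, not fixed by the logic above):

```bash
# preferred: native HIP language support, needs CMake >= 3.21
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release

# legacy: hipcc as the C++ compiler; CXX_IS_HIPCC matches and CMake prints the legacy-behavior warning
CXX=/opt/rocm/bin/hipcc cmake -S . -B build2 -DLLAMA_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release
```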
@@ -995,6 +1008,11 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
         if (GGML_COMPILER_SUPPORT_DOTPROD)
             add_compile_definitions(__ARM_FEATURE_DOTPROD)
         endif ()
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+        endif ()
+
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
         if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
             add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)

@@ -1023,6 +1041,9 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
             # Raspberry Pi 3, 4, Zero 2 (32-bit)
             list(APPEND ARCH_FLAGS -mno-unaligned-access)
         endif()
+        if (LLAMA_SVE)
+            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
+        endif()
     endif()
 elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
     (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND

@@ -1047,6 +1068,10 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
             add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
             add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
         endif()
+        if (LLAMA_AVX512_BF16)
+            add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
+            add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+        endif()
     elseif (LLAMA_AVX2)
         list(APPEND ARCH_FLAGS /arch:AVX2)
     elseif (LLAMA_AVX)

@@ -1078,6 +1103,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
         if (LLAMA_AVX512_VNNI)
             list(APPEND ARCH_FLAGS -mavx512vnni)
         endif()
+        if (LLAMA_AVX512_BF16)
+            list(APPEND ARCH_FLAGS -mavx512bf16)
+        endif()
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
     message(STATUS "PowerPC detected")
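The new instruction-set toggles are ordinary CMake options, so they are opt-in at configure time. Two hedged examples follow; pick the one matching your hardware, and note that the option combinations are illustrative rather than taken from this diff:

```bash
# x86-64 host with AVX-512 BF16 support
cmake -B build -S . -DLLAMA_AVX512=ON -DLLAMA_AVX512_BF16=ON

# AArch64 host with SVE (adds -march=armv8.6-a+sve)
cmake -B build -S . -DLLAMA_SVE=ON
```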
@@ -1087,6 +1115,17 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
         list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
         #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
     endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+    message(STATUS "loongarch64 detected")
+
+    list(APPEND ARCH_FLAGS -march=loongarch64)
+    if (LLAMA_LASX)
+        list(APPEND ARCH_FLAGS -mlasx)
+    endif()
+    if (LLAMA_LSX)
+        list(APPEND ARCH_FLAGS -mlsx)
+    endif()
+
 else()
     message(STATUS "Unknown architecture")
 endif()

@@ -1175,7 +1214,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
             ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
             ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
             ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}

@@ -1262,7 +1301,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake

 set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
     "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
-    "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
+    "${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")

 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 install(TARGETS ggml PUBLIC_HEADER)
CMakePresets.json (new file, 45 lines)
@@ -0,0 +1,45 @@
+{
+  "version": 4,
+  "configurePresets": [
+    {
+      "name": "base",
+      "hidden": true,
+      "generator": "Ninja",
+      "binaryDir": "${sourceDir}/build-${presetName}",
+      "cacheVariables": {
+        "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+        "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+      }
+    },
+
+    { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
+    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
+
+    {
+      "name": "arm64-windows-msvc", "hidden": true,
+      "architecture": { "value": "arm64", "strategy": "external" },
+      "toolset": { "value": "host=x86_64", "strategy": "external" },
+      "cacheVariables": {
+        "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
+      }
+    },
+
+    {
+      "name": "arm64-windows-llvm", "hidden": true,
+      "architecture": { "value": "arm64", "strategy": "external" },
+      "toolset": { "value": "host=x86_64", "strategy": "external" },
+      "cacheVariables": {
+        "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
+      }
+    },
+
+    { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
+
+    { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
+    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
+    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
+  ]
+}
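These presets are consumed through CMake's standard preset support (a version-4 preset file needs a reasonably recent CMake, 3.23 or newer); the build directory follows the `build-${presetName}` pattern from the `base` preset. For example:

```bash
# list the configure presets defined in CMakePresets.json
cmake --list-presets

# configure and build one of them (Windows on ARM with the LLVM toolchain file)
cmake --preset arm64-windows-llvm-release
cmake --build build-arm64-windows-llvm-release
```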
Makefile (27 changed lines)

@@ -379,15 +379,16 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
         CUDA_POWER_ARCH = 1
     endif

+    ifneq ($(filter loongarch64%,$(UNAME_M)),)
+        MK_CFLAGS += -mlasx
+        MK_CXXFLAGS += -mlasx
+    endif
+
 else
     MK_CFLAGS += -march=rv64gcv -mabi=lp64d
     MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif

-ifdef LLAMA_QKK_64
-    MK_CPPFLAGS += -DGGML_QKK_64
-endif
-
 ifndef LLAMA_NO_ACCELERATE
     # Mac OS - include Accelerate framework.
     # `-framework Accelerate` works both with Apple Silicon and Mac Intel

@@ -399,13 +400,6 @@ ifndef LLAMA_NO_ACCELERATE
     endif
 endif # LLAMA_NO_ACCELERATE

-ifdef LLAMA_MPI
-    MK_CPPFLAGS += -DGGML_USE_MPI
-    MK_CFLAGS += -Wno-cast-qual
-    MK_CXXFLAGS += -Wno-cast-qual
-    OBJS += ggml-mpi.o
-endif # LLAMA_MPI
-
 ifdef LLAMA_OPENBLAS
     MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
     MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)

@@ -560,10 +554,10 @@ endif # LLAMA_VULKAN
 ifdef LLAMA_HIPBLAS
     ifeq ($(wildcard /opt/rocm),)
         ROCM_PATH ?= /usr
-        GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
+        AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
     else
         ROCM_PATH ?= /opt/rocm
-        GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+        AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
     endif
     HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
     LLAMA_CUDA_DMMV_X ?= 32

@@ -575,7 +569,7 @@ ifdef LLAMA_HIP_UMA
 endif # LLAMA_HIP_UMA
     MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
     MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
-    HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
+    HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
     HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
     HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
     HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
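For the Makefile path this means the GPU architecture override is now spelled AMDGPU_TARGETS, matching the CMake variable name, instead of GPU_TARGETS. A hedged usage sketch (gfx1030 is an example target, not something fixed by this diff):

```bash
# HIP build via make, selecting the GPU architecture explicitly
make LLAMA_HIPBLAS=1 AMDGPU_TARGETS=gfx1030 -j "$(nproc)"
```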
@@ -629,11 +623,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
     endif
 endif # LLAMA_METAL

-ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
-	$(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_MPI
-
 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
README.md (74 changed lines)

@@ -107,7 +107,6 @@ Typically finetunes of the base models below are supported as well.
 - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
 - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
 - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
-- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
 - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
 - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
 - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)

@@ -128,6 +127,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
+- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)

 (instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))

@@ -141,6 +141,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
+- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)

 **HTTP server**

@@ -301,7 +302,7 @@ cd llama.cpp

 ### Build

-In order to build llama.cpp you have three different options.
+In order to build llama.cpp you have four different options.

 - Using `make`:
   - On Linux or MacOS:
@ -382,45 +383,6 @@ To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or th
|
|||||||
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
|
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
|
||||||
argument.
|
argument.
|
||||||
|
|
||||||
### MPI Build
|
|
||||||
|
|
||||||
MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
|
|
||||||
|
|
||||||
First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
|
|
||||||
|
|
||||||
Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
|
|
||||||
|
|
||||||
- Using `make`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
make CC=mpicc CXX=mpicxx LLAMA_MPI=1
|
|
||||||
```
|
|
||||||
|
|
||||||
- Using `CMake`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cmake -S . -B build -DLLAMA_MPI=ON
|
|
||||||
```
|
|
||||||
|
|
||||||
Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
|
|
||||||
|
|
||||||
Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
|
|
||||||
|
|
||||||
Here is an example hostfile:
|
|
||||||
|
|
||||||
```
|
|
||||||
192.168.0.1:2
|
|
||||||
malvolio.local:1
|
|
||||||
```
|
|
||||||
|
|
||||||
The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
|
|
||||||
|
|
||||||
Finally, you're ready to run a computation using `mpirun`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
|
|
||||||
```
|
|
||||||
|
|
||||||
### BLAS Build
|
### BLAS Build
|
||||||
|
|
||||||
Building the program with BLAS support may lead to some performance improvements in prompt processing when using batch sizes larger than 32 (the default is 512). CPU-only BLAS implementations do not affect normal generation performance, while GPU-backed implementations such as cuBLAS, hipBLAS and CLBlast may also improve generation performance. There are currently several BLAS implementations available for build and use:
|
Building the program with BLAS support may lead to some performance improvements in prompt processing when using batch sizes larger than 32 (the default is 512). CPU-only BLAS implementations do not affect normal generation performance, while GPU-backed implementations such as cuBLAS, hipBLAS and CLBlast may also improve generation performance. There are currently several BLAS implementations available for build and use:
|
||||||
@ -528,13 +490,28 @@ Building the program with BLAS support may lead to some performance improvements
|
|||||||
```
|
```
|
||||||
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
|
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
|
||||||
```bash
|
```bash
|
||||||
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
|
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
||||||
cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
||||||
&& cmake --build build --config Release -- -j 16
|
&& cmake --build build --config Release -- -j 16
|
||||||
```
|
```
|
||||||
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
|
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
|
||||||
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
|
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
|
||||||
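For example, a minimal CMake sketch with UMA enabled (the `hipconfig` detection and the `gfx1030` target are carried over from the examples in this section and may need adjusting for your hardware):

```bash
# Sketch: hipBLAS build with UMA so an integrated GPU can share main memory
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build -DLLAMA_HIPBLAS=ON -DLLAMA_HIP_UMA=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build --config Release -- -j 16
```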
|
|
||||||
|
Note that if you get the following error:
|
||||||
|
```
|
||||||
|
clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
|
||||||
|
```
|
||||||
|
Try searching for a directory under `HIP_PATH` that contains the file
|
||||||
|
`oclc_abi_version_400.bc`. Then, add the following to the start of the
|
||||||
|
command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
|
||||||
|
like:
|
||||||
|
```bash
|
||||||
|
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
|
||||||
|
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
|
||||||
|
cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
||||||
|
&& cmake --build build -- -j 16
|
||||||
|
```
|
||||||
|
|
||||||
- Using `make` (example for target gfx1030, build with 16 CPU threads):
|
- Using `make` (example for target gfx1030, build with 16 CPU threads):
|
||||||
```bash
|
```bash
|
||||||
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
|
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
|
||||||
@ -543,10 +520,8 @@ Building the program with BLAS support may lead to some performance improvements
|
|||||||
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
|
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
|
||||||
```bash
|
```bash
|
||||||
set PATH=%HIP_PATH%\bin;%PATH%
|
set PATH=%HIP_PATH%\bin;%PATH%
|
||||||
mkdir build
|
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
|
||||||
cd build
|
cmake --build build
|
||||||
cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release ..
|
|
||||||
cmake --build .
|
|
||||||
```
|
```
|
||||||
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100`, which corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors).
|
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100`, which corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors).
|
||||||
Find your GPU version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
|
Find your GPU version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
|
||||||
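As a quick sketch, the detection and mapping can be done by hand like this (the `gfx1035` output is only an example):

```bash
# Print the gfx string of the installed GPU; match it to the closest processor family above
rocminfo | grep gfx | head -1 | awk '{print $2}'   # e.g. gfx1035 -> build with AMDGPU_TARGETS=gfx1030
```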
@ -712,6 +687,9 @@ Building the program with BLAS support may lead to some performance improvements
|
|||||||
|
|
||||||
### Prepare and Quantize
|
### Prepare and Quantize
|
||||||
|
|
||||||
|
> [!NOTE]
|
||||||
|
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantize your model weights without any setup. It is synced from `llama.cpp` main every 6 hours.
|
||||||
|
|
||||||
To obtain the official LLaMA 2 weights, please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
|
To obtain the official LLaMA 2 weights, please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
|
||||||
|
|
||||||
Note: `convert.py` does not support LLaMA 3; use `convert-hf-to-gguf.py` instead, with LLaMA 3 downloaded from Hugging Face.
|
Note: `convert.py` does not support LLaMA 3; use `convert-hf-to-gguf.py` instead, with LLaMA 3 downloaded from Hugging Face.
|
||||||
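As a hedged sketch (the local checkpoint path and the `q4_0` type are illustrative), converting and quantizing a Hugging Face LLaMA 3 download might look like:

```bash
# Sketch: convert an HF LLaMA 3 checkpoint to GGUF, then quantize it
# (./models/llama-3 and q4_0 are illustrative; use the quantize binary from your build)
python3 convert-hf-to-gguf.py ./models/llama-3 --outfile ./models/llama-3/ggml-model-f16.gguf
./quantize ./models/llama-3/ggml-model-f16.gguf ./models/llama-3/ggml-model-q4_0.gguf q4_0
```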
|
172
build.zig
172
build.zig
@ -1,172 +0,0 @@
|
|||||||
// Compatible with Zig Version 0.11.0
|
|
||||||
const std = @import("std");
|
|
||||||
const ArrayList = std.ArrayList;
|
|
||||||
const Compile = std.Build.Step.Compile;
|
|
||||||
const ConfigHeader = std.Build.Step.ConfigHeader;
|
|
||||||
const Mode = std.builtin.Mode;
|
|
||||||
const CrossTarget = std.zig.CrossTarget;
|
|
||||||
|
|
||||||
const Maker = struct {
|
|
||||||
builder: *std.build.Builder,
|
|
||||||
target: CrossTarget,
|
|
||||||
optimize: Mode,
|
|
||||||
enable_lto: bool,
|
|
||||||
|
|
||||||
include_dirs: ArrayList([]const u8),
|
|
||||||
cflags: ArrayList([]const u8),
|
|
||||||
cxxflags: ArrayList([]const u8),
|
|
||||||
objs: ArrayList(*Compile),
|
|
||||||
|
|
||||||
fn addInclude(m: *Maker, dir: []const u8) !void {
|
|
||||||
try m.include_dirs.append(dir);
|
|
||||||
}
|
|
||||||
fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
|
|
||||||
try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
|
|
||||||
}
|
|
||||||
fn addCFlag(m: *Maker, flag: []const u8) !void {
|
|
||||||
try m.cflags.append(flag);
|
|
||||||
}
|
|
||||||
fn addCxxFlag(m: *Maker, flag: []const u8) !void {
|
|
||||||
try m.cxxflags.append(flag);
|
|
||||||
}
|
|
||||||
fn addFlag(m: *Maker, flag: []const u8) !void {
|
|
||||||
try m.addCFlag(flag);
|
|
||||||
try m.addCxxFlag(flag);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn init(builder: *std.build.Builder) !Maker {
|
|
||||||
const target = builder.standardTargetOptions(.{});
|
|
||||||
const zig_version = @import("builtin").zig_version_string;
|
|
||||||
const commit_hash = try std.ChildProcess.exec(
|
|
||||||
.{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
|
|
||||||
);
|
|
||||||
try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
|
|
||||||
\\int LLAMA_BUILD_NUMBER = {};
|
|
||||||
\\char const *LLAMA_COMMIT = "{s}";
|
|
||||||
\\char const *LLAMA_COMPILER = "Zig {s}";
|
|
||||||
\\char const *LLAMA_BUILD_TARGET = "{s}";
|
|
||||||
\\
|
|
||||||
, .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
|
|
||||||
var m = Maker{
|
|
||||||
.builder = builder,
|
|
||||||
.target = target,
|
|
||||||
.optimize = builder.standardOptimizeOption(.{}),
|
|
||||||
.enable_lto = false,
|
|
||||||
.include_dirs = ArrayList([]const u8).init(builder.allocator),
|
|
||||||
.cflags = ArrayList([]const u8).init(builder.allocator),
|
|
||||||
.cxxflags = ArrayList([]const u8).init(builder.allocator),
|
|
||||||
.objs = ArrayList(*Compile).init(builder.allocator),
|
|
||||||
};
|
|
||||||
|
|
||||||
try m.addCFlag("-std=c11");
|
|
||||||
try m.addCxxFlag("-std=c++11");
|
|
||||||
try m.addProjectInclude(&.{});
|
|
||||||
try m.addProjectInclude(&.{"common"});
|
|
||||||
return m;
|
|
||||||
}
|
|
||||||
|
|
||||||
fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
|
|
||||||
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
|
|
||||||
if (o.target.getAbi() != .msvc)
|
|
||||||
o.defineCMacro("_GNU_SOURCE", null);
|
|
||||||
|
|
||||||
if (std.mem.endsWith(u8, src, ".c")) {
|
|
||||||
o.addCSourceFiles(&.{src}, m.cflags.items);
|
|
||||||
o.linkLibC();
|
|
||||||
} else {
|
|
||||||
o.addCSourceFiles(&.{src}, m.cxxflags.items);
|
|
||||||
if (o.target.getAbi() == .msvc) {
|
|
||||||
o.linkLibC(); // need winsdk + crt
|
|
||||||
} else {
|
|
||||||
// linkLibCpp already add (libc++ + libunwind + libc)
|
|
||||||
o.linkLibCpp();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
|
|
||||||
o.want_lto = m.enable_lto;
|
|
||||||
return o;
|
|
||||||
}
|
|
||||||
|
|
||||||
fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
|
|
||||||
const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
|
|
||||||
e.addCSourceFiles(&.{src}, m.cxxflags.items);
|
|
||||||
for (deps) |d| e.addObject(d);
|
|
||||||
for (m.objs.items) |o| e.addObject(o);
|
|
||||||
for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
|
|
||||||
|
|
||||||
// https://github.com/ziglang/zig/issues/15448
|
|
||||||
if (e.target.getAbi() == .msvc) {
|
|
||||||
e.linkLibC(); // need winsdk + crt
|
|
||||||
} else {
|
|
||||||
// linkLibCpp already add (libc++ + libunwind + libc)
|
|
||||||
e.linkLibCpp();
|
|
||||||
}
|
|
||||||
m.builder.installArtifact(e);
|
|
||||||
e.want_lto = m.enable_lto;
|
|
||||||
return e;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
pub fn build(b: *std.build.Builder) !void {
|
|
||||||
var make = try Maker.init(b);
|
|
||||||
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
|
|
||||||
|
|
||||||
const ggml = make.obj("ggml", "ggml.c");
|
|
||||||
const sgemm = make.obj("sgemm", "sgemm.cpp");
|
|
||||||
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
|
|
||||||
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
|
|
||||||
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
|
|
||||||
const unicode = make.obj("unicode", "unicode.cpp");
|
|
||||||
const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
|
|
||||||
const llama = make.obj("llama", "llama.cpp");
|
|
||||||
const buildinfo = make.obj("common", "common/build-info.cpp");
|
|
||||||
const common = make.obj("common", "common/common.cpp");
|
|
||||||
const console = make.obj("console", "common/console.cpp");
|
|
||||||
const sampling = make.obj("sampling", "common/sampling.cpp");
|
|
||||||
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
|
|
||||||
const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
|
|
||||||
const train = make.obj("train", "common/train.cpp");
|
|
||||||
const clip = make.obj("clip", "examples/llava/clip.cpp");
|
|
||||||
const llava = make.obj("llava", "examples/llava/llava.cpp");
|
|
||||||
|
|
||||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
|
|
||||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
|
|
||||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
|
|
||||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
|
|
||||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
|
|
||||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
|
|
||||||
|
|
||||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
|
|
||||||
if (server.target.isWindows()) {
|
|
||||||
server.linkSystemLibrary("ws2_32");
|
|
||||||
}
|
|
||||||
|
|
||||||
const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
|
|
||||||
for (server_assets) |asset| {
|
|
||||||
const input_path = b.fmt("examples/server/public/{s}", .{asset});
|
|
||||||
const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
|
|
||||||
|
|
||||||
// Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
|
|
||||||
|
|
||||||
const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
|
|
||||||
defer b.allocator.free(input);
|
|
||||||
|
|
||||||
var buf = std.ArrayList(u8).init(b.allocator);
|
|
||||||
defer buf.deinit();
|
|
||||||
|
|
||||||
for (input) |byte| {
|
|
||||||
try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
|
|
||||||
}
|
|
||||||
|
|
||||||
var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
|
|
||||||
defer b.allocator.free(name);
|
|
||||||
std.mem.replaceScalar(u8, name, '.', '_');
|
|
||||||
|
|
||||||
try std.fs.cwd().writeFile(output_path, b.fmt(
|
|
||||||
"unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
|
|
||||||
.{ name, buf.items, name, input.len },
|
|
||||||
));
|
|
||||||
|
|
||||||
std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
|
|
||||||
}
|
|
||||||
}
|
|
423
ci/run.sh
423
ci/run.sh
@ -202,12 +202,15 @@ function gg_sum_test_scripts_release {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function gg_get_model {
|
function gg_get_model {
|
||||||
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
|
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
|
||||||
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
|
||||||
if [[ -s $gguf_3b ]]; then
|
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
||||||
echo -n "$gguf_3b"
|
if [[ -s $gguf_0 ]]; then
|
||||||
elif [[ -s $gguf_7b ]]; then
|
echo -n "$gguf_0"
|
||||||
echo -n "$gguf_7b"
|
elif [[ -s $gguf_1 ]]; then
|
||||||
|
echo -n "$gguf_1"
|
||||||
|
elif [[ -s $gguf_2 ]]; then
|
||||||
|
echo -n "$gguf_2"
|
||||||
else
|
else
|
||||||
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
||||||
exit 1
|
exit 1
|
||||||
@ -256,139 +259,6 @@ function gg_sum_ctest_with_model_release {
|
|||||||
gg_printf '```\n'
|
gg_printf '```\n'
|
||||||
}
|
}
|
||||||
|
|
||||||
# open_llama_3b_v2
|
|
||||||
|
|
||||||
function gg_run_open_llama_3b_v2 {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
|
|
||||||
|
|
||||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
|
||||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
|
||||||
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
|
||||||
|
|
||||||
path_models="../models-mnt/open-llama/3B-v2"
|
|
||||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
|
||||||
|
|
||||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
|
||||||
|
|
||||||
python3 ../convert.py ${path_models}
|
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
|
||||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
|
||||||
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
|
||||||
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
|
||||||
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
|
||||||
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
|
||||||
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
|
||||||
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
|
||||||
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
|
||||||
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
|
||||||
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
|
||||||
|
|
||||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
|
||||||
|
|
||||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
|
||||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
|
||||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
|
||||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
|
||||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
|
||||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
|
||||||
|
|
||||||
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
|
||||||
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
|
||||||
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
|
||||||
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
|
||||||
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
|
||||||
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
|
||||||
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
|
||||||
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
|
||||||
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
|
||||||
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
|
||||||
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
|
||||||
|
|
||||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
|
||||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
|
||||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
|
||||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
|
||||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
|
||||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
|
||||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
|
||||||
|
|
||||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
|
||||||
|
|
||||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
|
||||||
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
|
||||||
|
|
||||||
function check_ppl {
|
|
||||||
qnt="$1"
|
|
||||||
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
|
||||||
|
|
||||||
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
|
||||||
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
|
||||||
return 20
|
|
||||||
fi
|
|
||||||
|
|
||||||
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
|
|
||||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_open_llama_3b_v2 {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'OpenLLaMA 3B-v2:\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
|
||||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
|
||||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
|
||||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
|
||||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
|
||||||
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
|
||||||
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
|
||||||
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
|
||||||
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
|
||||||
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
|
||||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
|
||||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
|
||||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
|
||||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
|
||||||
}
|
|
||||||
|
|
||||||
# open_llama_7b_v2
|
# open_llama_7b_v2
|
||||||
# requires: GG_BUILD_CUDA
|
# requires: GG_BUILD_CUDA
|
||||||
|
|
||||||
@ -417,7 +287,7 @@ function gg_run_open_llama_7b_v2 {
|
|||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert.py ${path_models}
|
python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
@ -526,6 +396,272 @@ function gg_sum_open_llama_7b_v2 {
|
|||||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# pythia_1.4b
|
||||||
|
|
||||||
|
function gg_run_pythia_1_4b {
|
||||||
|
cd ${SRC}
|
||||||
|
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
|
||||||
|
|
||||||
|
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||||
|
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||||
|
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
||||||
|
|
||||||
|
path_models="../models-mnt/pythia/1.4B"
|
||||||
|
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||||
|
|
||||||
|
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
|
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||||
|
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||||
|
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||||
|
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||||
|
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||||
|
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||||
|
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||||
|
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||||
|
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||||
|
|
||||||
|
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||||
|
|
||||||
|
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
|
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
|
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
|
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
|
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
|
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
|
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
|
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
|
function check_ppl {
|
||||||
|
qnt="$1"
|
||||||
|
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
|
||||||
|
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||||
|
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||||
|
return 20
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
||||||
|
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
|
||||||
|
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||||
|
|
||||||
|
set +e
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_sum_pythia_1_4b {
|
||||||
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
|
gg_printf 'Pythia 1.4B:\n'
|
||||||
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
|
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||||
|
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||||
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
|
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||||
|
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||||
|
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||||
|
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||||
|
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||||
|
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||||
|
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||||
|
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||||
|
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||||
|
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||||
|
}
|
||||||
|
|
||||||
|
# pythia_2_8b
|
||||||
|
# requires: GG_BUILD_CUDA
|
||||||
|
|
||||||
|
function gg_run_pythia_2_8b {
|
||||||
|
cd ${SRC}
|
||||||
|
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
|
||||||
|
|
||||||
|
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||||
|
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||||
|
|
||||||
|
path_models="../models-mnt/pythia/2.8B"
|
||||||
|
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||||
|
|
||||||
|
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
|
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||||
|
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||||
|
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||||
|
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||||
|
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||||
|
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||||
|
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||||
|
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||||
|
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||||
|
|
||||||
|
wiki_test="${path_wiki}/wiki.test.raw"
|
||||||
|
|
||||||
|
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
|
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
|
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
|
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
|
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
|
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
|
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
|
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
|
function check_ppl {
|
||||||
|
qnt="$1"
|
||||||
|
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
|
||||||
|
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||||
|
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||||
|
return 20
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
||||||
|
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
|
||||||
|
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||||
|
|
||||||
|
set +e
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_sum_pythia_2_8b {
|
||||||
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
|
gg_printf 'Pythia 2.8B:\n'
|
||||||
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
|
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||||
|
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||||
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
|
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||||
|
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||||
|
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||||
|
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||||
|
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||||
|
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||||
|
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||||
|
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||||
|
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||||
|
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||||
|
}
|
||||||
|
|
||||||
# bge-small
|
# bge-small
|
||||||
|
|
||||||
function gg_run_embd_bge_small {
|
function gg_run_embd_bge_small {
|
||||||
@ -552,7 +688,7 @@ function gg_run_embd_bge_small {
|
|||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert-hf-to-gguf.py ${path_models}
|
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
@ -606,9 +742,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
|||||||
|
|
||||||
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
||||||
if [ -z ${GG_BUILD_CUDA} ]; then
|
if [ -z ${GG_BUILD_CUDA} ]; then
|
||||||
test $ret -eq 0 && gg_run open_llama_3b_v2
|
test $ret -eq 0 && gg_run pythia_1_4b
|
||||||
else
|
else
|
||||||
test $ret -eq 0 && gg_run open_llama_7b_v2
|
test $ret -eq 0 && gg_run pythia_2_8b
|
||||||
|
#test $ret -eq 0 && gg_run open_llama_7b_v2
|
||||||
fi
|
fi
|
||||||
test $ret -eq 0 && gg_run ctest_with_model_debug
|
test $ret -eq 0 && gg_run ctest_with_model_debug
|
||||||
test $ret -eq 0 && gg_run ctest_with_model_release
|
test $ret -eq 0 && gg_run ctest_with_model_release
|
||||||
|
16
cmake/arm64-windows-llvm.cmake
Normal file
16
cmake/arm64-windows-llvm.cmake
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
set( CMAKE_SYSTEM_NAME Windows )
|
||||||
|
set( CMAKE_SYSTEM_PROCESSOR arm64 )
|
||||||
|
|
||||||
|
set( target arm64-pc-windows-msvc )
|
||||||
|
|
||||||
|
set( CMAKE_C_COMPILER clang )
|
||||||
|
set( CMAKE_CXX_COMPILER clang++ )
|
||||||
|
|
||||||
|
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||||
|
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||||
|
|
||||||
|
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
|
||||||
|
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
|
||||||
|
|
||||||
|
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||||
|
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
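A hedged usage sketch for the new toolchain file (the exact invocation is an assumption; the file itself does not prescribe one):

```bash
# Sketch: cross-compile for Windows on ARM64 with the LLVM toolchain file
cmake -S . -B build -DCMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release
```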
6
cmake/arm64-windows-msvc.cmake
Normal file
6
cmake/arm64-windows-msvc.cmake
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
set( CMAKE_SYSTEM_NAME Windows )
|
||||||
|
set( CMAKE_SYSTEM_PROCESSOR arm64 )
|
||||||
|
|
||||||
|
set( target arm64-pc-windows-msvc )
|
||||||
|
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||||
|
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
1328
common/common.cpp
1328
common/common.cpp
File diff suppressed because it is too large
@ -27,7 +27,7 @@
|
|||||||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||||
|
|
||||||
#define print_build_info() do { \
|
#define print_build_info() do { \
|
||||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
|
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
|
||||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
@ -35,14 +35,18 @@
|
|||||||
|
|
||||||
// build info
|
// build info
|
||||||
extern int LLAMA_BUILD_NUMBER;
|
extern int LLAMA_BUILD_NUMBER;
|
||||||
extern char const *LLAMA_COMMIT;
|
extern char const * LLAMA_COMMIT;
|
||||||
extern char const *LLAMA_COMPILER;
|
extern char const * LLAMA_COMPILER;
|
||||||
extern char const *LLAMA_BUILD_TARGET;
|
extern char const * LLAMA_BUILD_TARGET;
|
||||||
|
|
||||||
struct llama_control_vector_load_info;
|
struct llama_control_vector_load_info;
|
||||||
|
|
||||||
int get_math_cpu_count();
|
//
|
||||||
int32_t get_num_physical_cores();
|
// CPU utils
|
||||||
|
//
|
||||||
|
|
||||||
|
int32_t cpu_get_num_physical_cores();
|
||||||
|
int32_t cpu_get_num_math();
|
||||||
|
|
||||||
//
|
//
|
||||||
// CLI argument parsing
|
// CLI argument parsing
|
||||||
@ -51,7 +55,7 @@ int32_t get_num_physical_cores();
|
|||||||
struct gpt_params {
|
struct gpt_params {
|
||||||
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
||||||
|
|
||||||
int32_t n_threads = get_math_cpu_count();
|
int32_t n_threads = cpu_get_num_math();
|
||||||
int32_t n_threads_draft = -1;
|
int32_t n_threads_draft = -1;
|
||||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||||
int32_t n_threads_batch_draft = -1;
|
int32_t n_threads_batch_draft = -1;
|
||||||
@ -82,6 +86,7 @@ struct gpt_params {
|
|||||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||||
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
||||||
|
std::string rpc_servers = ""; // comma separated list of RPC servers
|
||||||
|
|
||||||
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
||||||
void * cb_eval_user_data = nullptr;
|
void * cb_eval_user_data = nullptr;
|
||||||
@ -141,6 +146,7 @@ struct gpt_params {
|
|||||||
bool use_color = false; // use color to distinguish generations and inputs
|
bool use_color = false; // use color to distinguish generations and inputs
|
||||||
bool interactive = false; // interactive mode
|
bool interactive = false; // interactive mode
|
||||||
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
|
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
|
||||||
|
bool no_special = false; // disable control token output
|
||||||
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
|
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
|
||||||
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
|
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
|
||||||
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
||||||
@ -178,33 +184,34 @@ struct gpt_params {

 void gpt_params_handle_model_default(gpt_params & params);

-bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
+bool gpt_params_parse    (int argc, char ** argv, gpt_params & params);
+bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);

-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
+std::string gpt_params_get_system_info(const gpt_params & params);

-bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

-void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

-bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);

-std::string get_system_info(const gpt_params & params);

-std::string gpt_random_prompt(std::mt19937 & rng);

-void process_escapes(std::string& input);

-bool validate_file_name(const std::string & filename);

 //
 // String utils
 //

-std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
 std::vector<std::string> string_split(std::string input, char separator);

 std::string string_strip(const std::string & str);
-std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
+std::string string_get_sortable_timestamp();
+std::string string_random_prompt(std::mt19937 & rng);

+bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+void string_process_escapes(std::string & input);

+//
+// Filesystem utils
+//

+bool fs_validate_filename(const std::string & filename);
+bool fs_create_directory_with_parents(const std::string & path);

+std::string fs_get_cache_directory();

 //
 // Model utils
@ -275,29 +282,15 @@ std::string llama_detokenize_bpe(
 // defaults to true when model type is SPM, otherwise false.
 bool llama_should_add_bos_token(const llama_model * model);

-//
-// YAML utils
-//

-bool create_directory_with_parents(const std::string & path);
-void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
-void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
-void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
-std::string get_sortable_timestamp();

-void dump_non_result_info_yaml(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

 //
 // KV cache utils
 //

 // Dump the KV cache view with the number of sequences per cell.
-void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

 // Dump the KV cache view showing individual sequences in each cell (long output).
-void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

 //
 // Embedding utils
@ -331,6 +324,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 //
 // Split utils
 //

 static const char * const LLM_KV_SPLIT_NO = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

+//
+// YAML utils
+//

+void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
+void yaml_dump_vector_int   (FILE * stream, const char * prop_name, const std::vector<int> & data);
+void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);

+void yaml_dump_non_result_info(
+    FILE * stream, const gpt_params & params, const llama_context * lctx,
+    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
@ -26,7 +26,7 @@ namespace grammar_parser {
|
|||||||
|
|
||||||
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
|
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
|
||||||
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||||
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
|
auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
|
||||||
return result.first->second;
|
return result.first->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -272,7 +272,7 @@ private:
|
|||||||
if (literal.empty()) {
|
if (literal.empty()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
ret.push_back(std::make_pair(literal, true));
|
ret.emplace_back(literal, true);
|
||||||
literal.clear();
|
literal.clear();
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
@ -298,7 +298,7 @@ private:
|
|||||||
while (i < length) {
|
while (i < length) {
|
||||||
char c = sub_pattern[i];
|
char c = sub_pattern[i];
|
||||||
if (c == '.') {
|
if (c == '.') {
|
||||||
seq.push_back(std::make_pair(get_dot(), false));
|
seq.emplace_back(get_dot(), false);
|
||||||
i++;
|
i++;
|
||||||
} else if (c == '(') {
|
} else if (c == '(') {
|
||||||
i++;
|
i++;
|
||||||
@ -307,7 +307,7 @@ private:
|
|||||||
_warnings.push_back("Unsupported pattern syntax");
|
_warnings.push_back("Unsupported pattern syntax");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false));
|
seq.emplace_back("(" + to_rule(transform()) + ")", false);
|
||||||
} else if (c == ')') {
|
} else if (c == ')') {
|
||||||
i++;
|
i++;
|
||||||
if (start > 0 && sub_pattern[start - 1] != '(') {
|
if (start > 0 && sub_pattern[start - 1] != '(') {
|
||||||
@ -331,9 +331,9 @@ private:
|
|||||||
}
|
}
|
||||||
square_brackets += ']';
|
square_brackets += ']';
|
||||||
i++;
|
i++;
|
||||||
seq.push_back(std::make_pair(square_brackets, false));
|
seq.emplace_back(square_brackets, false);
|
||||||
} else if (c == '|') {
|
} else if (c == '|') {
|
||||||
seq.push_back(std::make_pair("|", false));
|
seq.emplace_back("|", false);
|
||||||
i++;
|
i++;
|
||||||
} else if (c == '*' || c == '+' || c == '?') {
|
} else if (c == '*' || c == '+' || c == '?') {
|
||||||
seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
|
seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
|
||||||
@ -417,7 +417,7 @@ private:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!literal.empty()) {
|
if (!literal.empty()) {
|
||||||
seq.push_back(std::make_pair(literal, true));
|
seq.emplace_back(literal, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
common/log.h (10 changes)
@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
|||||||
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||||
#else
|
#else
|
||||||
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
|
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
|
||||||
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
#define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
#define LOG_FLF_FMT "%s"
|
#define LOG_FLF_FMT "%s"
|
||||||
@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
|||||||
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||||
#else
|
#else
|
||||||
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
|
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
|
||||||
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
#define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
#define LOG_TEE_FLF_FMT "%s"
|
#define LOG_TEE_FLF_FMT "%s"
|
||||||
@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
|||||||
// Main LOG macro.
|
// Main LOG macro.
|
||||||
// behaves like printf, and supports arguments the exact same way.
|
// behaves like printf, and supports arguments the exact same way.
|
||||||
//
|
//
|
||||||
#ifndef _MSC_VER
|
#if !defined(_MSC_VER) || defined(__clang__)
|
||||||
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
|
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
|
||||||
#else
|
#else
|
||||||
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
|
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
|
||||||
@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
|||||||
// Secondary target can be changed just like LOG_TARGET
|
// Secondary target can be changed just like LOG_TARGET
|
||||||
// by defining LOG_TEE_TARGET
|
// by defining LOG_TEE_TARGET
|
||||||
//
|
//
|
||||||
#ifndef _MSC_VER
|
#if !defined(_MSC_VER) || defined(__clang__)
|
||||||
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
|
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
|
||||||
#else
|
#else
|
||||||
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
|
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// LOG macro variants with auto endline.
|
// LOG macro variants with auto endline.
|
||||||
#ifndef _MSC_VER
|
#if !defined(_MSC_VER) || defined(__clang__)
|
||||||
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
|
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
|
||||||
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
|
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
|
||||||
#else
|
#else
|
||||||
|
@ -125,7 +125,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
|||||||
std::string result = "CFG -> Penalties ";
|
std::string result = "CFG -> Penalties ";
|
||||||
if (params.mirostat == 0) {
|
if (params.mirostat == 0) {
|
||||||
for (auto sampler_type : params.samplers_sequence) {
|
for (auto sampler_type : params.samplers_sequence) {
|
||||||
const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
|
const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
|
||||||
if (!sampler_type_name.empty()) {
|
if (!sampler_type_name.empty()) {
|
||||||
result += "-> " + sampler_type_name + " ";
|
result += "-> " + sampler_type_name + " ";
|
||||||
}
|
}
|
||||||
@ -137,6 +137,87 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
|
||||||
|
switch (sampler_type) {
|
||||||
|
case llama_sampler_type::TOP_K: return "top_k";
|
||||||
|
case llama_sampler_type::TFS_Z: return "tfs_z";
|
||||||
|
case llama_sampler_type::TYPICAL_P: return "typical_p";
|
||||||
|
case llama_sampler_type::TOP_P: return "top_p";
|
||||||
|
case llama_sampler_type::MIN_P: return "min_p";
|
||||||
|
case llama_sampler_type::TEMPERATURE: return "temperature";
|
||||||
|
default : return "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
|
||||||
|
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
|
||||||
|
{"top_k", llama_sampler_type::TOP_K},
|
||||||
|
{"top_p", llama_sampler_type::TOP_P},
|
||||||
|
{"typical_p", llama_sampler_type::TYPICAL_P},
|
||||||
|
{"min_p", llama_sampler_type::MIN_P},
|
||||||
|
{"tfs_z", llama_sampler_type::TFS_Z},
|
||||||
|
{"temperature", llama_sampler_type::TEMPERATURE}
|
||||||
|
};
|
||||||
|
|
||||||
|
// since samplers names are written multiple ways
|
||||||
|
// make it ready for both system names and input names
|
||||||
|
std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
|
||||||
|
{"top-k", llama_sampler_type::TOP_K},
|
||||||
|
{"top-p", llama_sampler_type::TOP_P},
|
||||||
|
{"nucleus", llama_sampler_type::TOP_P},
|
||||||
|
{"typical-p", llama_sampler_type::TYPICAL_P},
|
||||||
|
{"typical", llama_sampler_type::TYPICAL_P},
|
||||||
|
{"min-p", llama_sampler_type::MIN_P},
|
||||||
|
{"tfs-z", llama_sampler_type::TFS_Z},
|
||||||
|
{"tfs", llama_sampler_type::TFS_Z},
|
||||||
|
{"temp", llama_sampler_type::TEMPERATURE}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> sampler_types;
|
||||||
|
sampler_types.reserve(names.size());
|
||||||
|
for (const auto & name : names)
|
||||||
|
{
|
||||||
|
auto sampler_item = sampler_canonical_name_map.find(name);
|
||||||
|
if (sampler_item != sampler_canonical_name_map.end())
|
||||||
|
{
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (allow_alt_names)
|
||||||
|
{
|
||||||
|
sampler_item = sampler_alt_name_map.find(name);
|
||||||
|
if (sampler_item != sampler_alt_name_map.end())
|
||||||
|
{
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sampler_types;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
|
||||||
|
std::unordered_map<char, llama_sampler_type> sampler_name_map {
|
||||||
|
{'k', llama_sampler_type::TOP_K},
|
||||||
|
{'p', llama_sampler_type::TOP_P},
|
||||||
|
{'y', llama_sampler_type::TYPICAL_P},
|
||||||
|
{'m', llama_sampler_type::MIN_P},
|
||||||
|
{'f', llama_sampler_type::TFS_Z},
|
||||||
|
{'t', llama_sampler_type::TEMPERATURE}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> sampler_types;
|
||||||
|
sampler_types.reserve(names_string.size());
|
||||||
|
for (const auto & c : names_string) {
|
||||||
|
const auto sampler_item = sampler_name_map.find(c);
|
||||||
|
if (sampler_item != sampler_name_map.end()) {
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sampler_types;
|
||||||
|
}
|
||||||
|
|
||||||
// no reasons to expose this function in header
|
// no reasons to expose this function in header
|
||||||
static void sampler_queue(
|
static void sampler_queue(
|
||||||
struct llama_context * ctx_main,
|
struct llama_context * ctx_main,
|
||||||
@ -179,7 +260,7 @@ static llama_token llama_sampling_sample_impl(
|
|||||||
struct llama_context * ctx_main,
|
struct llama_context * ctx_main,
|
||||||
struct llama_context * ctx_cfg,
|
struct llama_context * ctx_cfg,
|
||||||
const int idx,
|
const int idx,
|
||||||
bool is_resampling) { // Add a parameter to indicate if we are resampling
|
bool is_resampling) {
|
||||||
const llama_sampling_params & params = ctx_sampling->params;
|
const llama_sampling_params & params = ctx_sampling->params;
|
||||||
|
|
||||||
const float temp = params.temp;
|
const float temp = params.temp;
|
||||||
@ -188,8 +269,8 @@ static llama_token llama_sampling_sample_impl(
|
|||||||
const float mirostat_eta = params.mirostat_eta;
|
const float mirostat_eta = params.mirostat_eta;
|
||||||
|
|
||||||
std::vector<float> original_logits;
|
std::vector<float> original_logits;
|
||||||
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits);
|
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
|
||||||
if (!is_resampling) {
|
if (ctx_sampling->grammar != NULL && !is_resampling) {
|
||||||
GGML_ASSERT(!original_logits.empty());
|
GGML_ASSERT(!original_logits.empty());
|
||||||
}
|
}
|
||||||
llama_token id = 0;
|
llama_token id = 0;
|
||||||
@ -252,7 +333,7 @@ static llama_token llama_sampling_sample_impl(
|
|||||||
// Restore logits from the copy
|
// Restore logits from the copy
|
||||||
std::copy(original_logits.begin(), original_logits.end(), logits);
|
std::copy(original_logits.begin(), original_logits.end(), logits);
|
||||||
|
|
||||||
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling
|
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -285,7 +366,8 @@ static llama_token_data_array llama_sampling_prepare_impl(
|
|||||||
// Get a pointer to the logits
|
// Get a pointer to the logits
|
||||||
float * logits = llama_get_logits_ith(ctx_main, idx);
|
float * logits = llama_get_logits_ith(ctx_main, idx);
|
||||||
|
|
||||||
if (apply_grammar && original_logits != NULL) {
|
if (ctx_sampling->grammar != NULL && !apply_grammar) {
|
||||||
|
GGML_ASSERT(original_logits != NULL);
|
||||||
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
|
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
|
||||||
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
|
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
|
||||||
}
|
}
|
||||||
@ -342,7 +424,7 @@ llama_token llama_sampling_sample(
|
|||||||
struct llama_context * ctx_cfg,
|
struct llama_context * ctx_cfg,
|
||||||
const int idx) {
|
const int idx) {
|
||||||
// Call the implementation function with is_resampling set to false by default
|
// Call the implementation function with is_resampling set to false by default
|
||||||
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
|
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token_data_array llama_sampling_prepare(
|
llama_token_data_array llama_sampling_prepare(
|
||||||
|
@ -116,6 +116,11 @@ std::string llama_sampling_print(const llama_sampling_params & params);
|
|||||||
// Print sampling order into a string
|
// Print sampling order into a string
|
||||||
std::string llama_sampling_order_print(const llama_sampling_params & params);
|
std::string llama_sampling_order_print(const llama_sampling_params & params);
|
||||||
|
|
||||||
|
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
||||||
|
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
|
||||||
|
|
||||||
// this is a common sampling function used across the examples for convenience
|
// this is a common sampling function used across the examples for convenience
|
||||||
// it can serve as a starting point for implementing your own sampling function
|
// it can serve as a starting point for implementing your own sampling function
|
||||||
// Note: When using multiple sequences, it is the caller's responsibility to call
|
// Note: When using multiple sequences, it is the caller's responsibility to call
|
||||||
|
@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {
|
|||||||
|
|
||||||
params.custom_n_ctx = false;
|
params.custom_n_ctx = false;
|
||||||
|
|
||||||
params.use_flash = true;
|
params.use_flash = false;
|
||||||
params.use_checkpointing = true;
|
params.use_checkpointing = true;
|
||||||
|
|
||||||
params.sample_start = "";
|
params.sample_start = "";
|
||||||
@ -1380,7 +1380,7 @@ bool consume_common_train_arg(
|
|||||||
|
|
||||||
void finish_processing_train_args(struct train_params_common * params) {
|
void finish_processing_train_args(struct train_params_common * params) {
|
||||||
if (params->escape) {
|
if (params->escape) {
|
||||||
process_escapes(params->sample_start);
|
string_process_escapes(params->sample_start);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -20,11 +20,13 @@
|
|||||||
# - Update llama.cpp with the new pre-tokenizer if necessary
|
# - Update llama.cpp with the new pre-tokenizer if necessary
|
||||||
#
|
#
|
||||||
# TODO: generate tokenizer tests for llama.cpp
|
# TODO: generate tokenizer tests for llama.cpp
|
||||||
# TODO: automate the update of convert-hf-to-gguf.py
|
|
||||||
#
|
#
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import pathlib
|
||||||
|
import re
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
@ -35,6 +37,7 @@ from transformers import AutoTokenizer
|
|||||||
|
|
||||||
logging.basicConfig(level=logging.DEBUG)
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
logger = logging.getLogger("convert-hf-to-gguf-update")
|
logger = logging.getLogger("convert-hf-to-gguf-update")
|
||||||
|
sess = requests.Session()
|
||||||
|
|
||||||
|
|
||||||
class TOKENIZER_TYPE(IntEnum):
|
class TOKENIZER_TYPE(IntEnum):
|
||||||
@ -69,73 +72,55 @@ models = [
|
|||||||
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
|
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
|
||||||
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
|
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
|
||||||
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
|
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
|
||||||
|
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
|
||||||
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
|
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
|
||||||
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
|
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
|
||||||
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
|
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
|
||||||
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
|
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
|
||||||
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
|
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
|
||||||
{"name": "jina-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||||
{"name": "jina-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||||
{"name": "jina-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||||
]
|
]
|
||||||
|
|
||||||
# make directory "models/tokenizers" if it doesn't exist
|
|
||||||
if not os.path.exists("models/tokenizers"):
|
|
||||||
os.makedirs("models/tokenizers")
|
|
||||||
|
|
||||||
|
|
||||||
def download_file_with_auth(url, token, save_path):
|
def download_file_with_auth(url, token, save_path):
|
||||||
headers = {"Authorization": f"Bearer {token}"}
|
headers = {"Authorization": f"Bearer {token}"}
|
||||||
response = requests.get(url, headers=headers)
|
response = sess.get(url, headers=headers)
|
||||||
if response.status_code == 200:
|
response.raise_for_status()
|
||||||
with open(save_path, 'wb') as f:
|
os.makedirs(os.path.dirname(save_path), exist_ok=True)
|
||||||
f.write(response.content)
|
with open(save_path, 'wb') as f:
|
||||||
logger.info(f"File {save_path} downloaded successfully")
|
f.write(response.content)
|
||||||
else:
|
logger.info(f"File {save_path} downloaded successfully")
|
||||||
logger.info(f"Failed to download file. Status code: {response.status_code}")
|
|
||||||
|
|
||||||
|
|
||||||
# download the tokenizer models
|
def download_model(model):
|
||||||
for model in models:
|
|
||||||
name = model["name"]
|
name = model["name"]
|
||||||
repo = model["repo"]
|
repo = model["repo"]
|
||||||
tokt = model["tokt"]
|
tokt = model["tokt"]
|
||||||
|
|
||||||
if not os.path.exists(f"models/tokenizers/{name}"):
|
os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
|
||||||
os.makedirs(f"models/tokenizers/{name}")
|
|
||||||
else:
|
|
||||||
logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
|
|
||||||
continue
|
|
||||||
|
|
||||||
logger.info(f"Downloading {name} to models/tokenizers/{name}")
|
|
||||||
|
|
||||||
url = f"{repo}/raw/main/config.json"
|
|
||||||
save_path = f"models/tokenizers/{name}/config.json"
|
|
||||||
download_file_with_auth(url, token, save_path)
|
|
||||||
|
|
||||||
url = f"{repo}/raw/main/tokenizer.json"
|
|
||||||
save_path = f"models/tokenizers/{name}/tokenizer.json"
|
|
||||||
download_file_with_auth(url, token, save_path)
|
|
||||||
|
|
||||||
# if downloaded file is less than 1KB, we likely need to download an LFS instead
|
|
||||||
if os.path.getsize(save_path) < 1024:
|
|
||||||
# remove the file
|
|
||||||
os.remove(save_path)
|
|
||||||
url = f"{repo}/resolve/main/tokenizer.json"
|
|
||||||
save_path = f"models/tokenizers/{name}/tokenizer.json"
|
|
||||||
download_file_with_auth(url, token, save_path)
|
|
||||||
|
|
||||||
|
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
|
||||||
if tokt == TOKENIZER_TYPE.SPM:
|
if tokt == TOKENIZER_TYPE.SPM:
|
||||||
url = f"{repo}/resolve/main/tokenizer.model"
|
files.append("tokenizer.model")
|
||||||
save_path = f"models/tokenizers/{name}/tokenizer.model"
|
|
||||||
download_file_with_auth(url, token, save_path)
|
for file in files:
|
||||||
|
save_path = f"models/tokenizers/{name}/{file}"
|
||||||
|
if os.path.isfile(save_path):
|
||||||
|
logger.info(f"{name}: File {save_path} already exists - skipping")
|
||||||
|
continue
|
||||||
|
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
|
||||||
|
|
||||||
|
|
||||||
|
for model in models:
|
||||||
|
try:
|
||||||
|
download_model(model)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
||||||
|
|
||||||
url = f"{repo}/raw/main/tokenizer_config.json"
|
|
||||||
save_path = f"models/tokenizers/{name}/tokenizer_config.json"
|
|
||||||
download_file_with_auth(url, token, save_path)
|
|
||||||
|
|
||||||
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
|
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
|
||||||
# TODO: auto-update convert-hf-to-gguf.py with the generated function
|
|
||||||
|
|
||||||
src_ifs = ""
|
src_ifs = ""
|
||||||
for model in models:
|
for model in models:
|
||||||
@ -224,11 +209,18 @@ src_func = f"""
|
|||||||
return res
|
return res
|
||||||
"""
|
"""
|
||||||
|
|
||||||
print(src_func) # noqa: NP100
|
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
|
||||||
|
convert_py = convert_py_pth.read_text()
|
||||||
|
convert_py = re.sub(
|
||||||
|
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
|
||||||
|
lambda m: m.group(1) + src_func + m.group(3),
|
||||||
|
convert_py,
|
||||||
|
flags=re.DOTALL | re.MULTILINE,
|
||||||
|
)
|
||||||
|
|
||||||
logger.info("\n")
|
convert_py_pth.write_text(convert_py)
|
||||||
logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
|
|
||||||
logger.info("\n")
|
logger.info("+++ convert-hf-to-gguf.py was updated")
|
||||||
|
|
||||||
# generate tests for each tokenizer model
|
# generate tests for each tokenizer model
|
||||||
|
|
||||||
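The hunk above stops printing the generated get_vocab_base_pre() for manual copy-paste and instead splices it into convert-hf-to-gguf.py in place, between the two marker comments. A minimal sketch of that mechanism, assuming the same markers; the helper name splice_between_markers is made up for illustration:

import re
from pathlib import Path

def splice_between_markers(path: str, generated_src: str) -> None:
    # Replace everything between the start/end markers with newly generated code.
    p = Path(path)
    text = p.read_text()
    text = re.sub(
        r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
        lambda m: m.group(1) + generated_src + m.group(3),
        text,
        flags=re.DOTALL | re.MULTILINE,
    )
    p.write_text(text)

Using a lambda as the replacement (rather than a replacement string) avoids having backslashes in the generated source re-interpreted as regex escapes.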
|
@ -14,6 +14,7 @@ from pathlib import Path
|
|||||||
from hashlib import sha256
|
from hashlib import sha256
|
||||||
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
|
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
|
||||||
|
|
||||||
|
import math
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@ -240,23 +241,6 @@ class Model:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def write_tensors(self):
|
def write_tensors(self):
|
||||||
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
|
|
||||||
def np_fp32_to_bf16(n: np.ndarray):
|
|
||||||
# force nan to quiet
|
|
||||||
n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
|
|
||||||
# flush subnormals to zero
|
|
||||||
n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
|
|
||||||
# round to nearest even
|
|
||||||
n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
|
|
||||||
return n.astype(np.int16)
|
|
||||||
|
|
||||||
# Doing this row-wise is much, much faster than element-wise, hence the signature
|
|
||||||
v_fp32_to_bf16 = np.vectorize(np_fp32_to_bf16, otypes=[np.int16], signature="(n)->(n)")
|
|
||||||
if self.lazy:
|
|
||||||
# TODO: find a way to implicitly wrap np.vectorize functions
|
|
||||||
# NOTE: the type is changed to reflect otypes passed to np.vectorize above
|
|
||||||
v_fp32_to_bf16 = gguf.LazyNumpyTensor._wrap_fn(v_fp32_to_bf16, meta_noop=np.int16)
|
|
||||||
|
|
||||||
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
|
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
|
||||||
|
|
||||||
for name, data_torch in self.get_tensors():
|
for name, data_torch in self.get_tensors():
|
||||||
@ -310,27 +294,30 @@ class Model:
|
|||||||
))
|
))
|
||||||
|
|
||||||
if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
|
if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
|
||||||
if self.ftype == gguf.LlamaFileType.MOSTLY_F16:
|
if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
|
||||||
|
data = gguf.quantize_bf16(data)
|
||||||
|
assert data.dtype == np.int16
|
||||||
|
data_qtype = gguf.GGMLQuantizationType.BF16
|
||||||
|
|
||||||
|
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
|
||||||
|
data = gguf.quantize_q8_0(data)
|
||||||
|
assert data.dtype == np.uint8
|
||||||
|
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
||||||
|
|
||||||
|
else: # default to float16 for quantized tensors
|
||||||
if data_dtype != np.float16:
|
if data_dtype != np.float16:
|
||||||
data = data.astype(np.float16)
|
data = data.astype(np.float16)
|
||||||
data_qtype = gguf.GGMLQuantizationType.F16
|
data_qtype = gguf.GGMLQuantizationType.F16
|
||||||
|
|
||||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
|
if data_qtype is None: # by default, convert to float32
|
||||||
if data_dtype != np.float32:
|
|
||||||
data = data.astype(np.float32)
|
|
||||||
data = v_fp32_to_bf16(data.view(np.int32))
|
|
||||||
assert data.dtype == np.int16
|
|
||||||
data_qtype = gguf.GGMLQuantizationType.BF16
|
|
||||||
|
|
||||||
else: # by default, convert to float32
|
|
||||||
if data_dtype != np.float32:
|
if data_dtype != np.float32:
|
||||||
data = data.astype(np.float32)
|
data = data.astype(np.float32)
|
||||||
data_qtype = gguf.GGMLQuantizationType.F32
|
data_qtype = gguf.GGMLQuantizationType.F32
|
||||||
|
|
||||||
assert data_qtype is not None
|
shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
|
||||||
|
|
||||||
# reverse shape to make it similar to the internal ggml dimension order
|
# reverse shape to make it similar to the internal ggml dimension order
|
||||||
shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
|
shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
|
||||||
|
|
||||||
# n_dims is implicit in the shape
|
# n_dims is implicit in the shape
|
||||||
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
||||||
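With this change the hand-rolled conversion moves into gguf-py (gguf.quantize_bf16 and gguf.quantize_q8_0). For context, the fp32-to-bf16 rounding that write_tensors used to do itself amounts to the following sketch, which mirrors the helper deleted above and is not the gguf-py implementation itself:

import numpy as np

def fp32_to_bf16_bits(a: np.ndarray) -> np.ndarray:
    n = a.astype(np.float32).view(np.uint32)
    # force NaNs to a quiet NaN payload
    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
    # flush subnormals to signed zero
    n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
    # round to nearest even, then keep the high 16 bits
    n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
    return n.astype(np.uint16)

The Q8_0 branch is additionally gated by gguf.can_quantize_to_q8_0 because Q8_0 packs values into fixed-size blocks; tensors whose shape is not eligible fall back to the F16 default in the final else branch.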
@ -416,6 +403,7 @@ class Model:
|
|||||||
# NOTE: this function is generated by convert-hf-to-gguf-update.py
|
# NOTE: this function is generated by convert-hf-to-gguf-update.py
|
||||||
# do not modify it manually!
|
# do not modify it manually!
|
||||||
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||||
|
# Marker: Start get_vocab_base_pre
|
||||||
def get_vocab_base_pre(self, tokenizer) -> str:
|
def get_vocab_base_pre(self, tokenizer) -> str:
|
||||||
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
||||||
# is specific for the BPE pre-tokenizer used by the model
|
# is specific for the BPE pre-tokenizer used by the model
|
||||||
@ -459,6 +447,9 @@ class Model:
|
|||||||
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
|
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
|
||||||
# ref: https://huggingface.co/openai-community/gpt2
|
# ref: https://huggingface.co/openai-community/gpt2
|
||||||
res = "gpt-2"
|
res = "gpt-2"
|
||||||
|
if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
|
||||||
|
# ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
|
||||||
|
res = "stablelm2"
|
||||||
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
|
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
|
||||||
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
|
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
|
||||||
res = "refact"
|
res = "refact"
|
||||||
@ -476,13 +467,13 @@ class Model:
|
|||||||
res = "dbrx"
|
res = "dbrx"
|
||||||
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
|
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
|
||||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
|
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
|
||||||
res = "jina-en"
|
res = "jina-v2-en"
|
||||||
if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
|
if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
|
||||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
|
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
|
||||||
res = "jina-es"
|
res = "jina-v2-es"
|
||||||
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
|
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
|
||||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
||||||
res = "jina-de"
|
res = "jina-v2-de"
|
||||||
|
|
||||||
if res is None:
|
if res is None:
|
||||||
logger.warning("\n")
|
logger.warning("\n")
|
||||||
@ -503,6 +494,7 @@ class Model:
|
|||||||
logger.debug(f"chkhsh: {chkhsh}")
|
logger.debug(f"chkhsh: {chkhsh}")
|
||||||
|
|
||||||
return res
|
return res
|
||||||
|
# Marker: End get_vocab_base_pre
|
||||||
|
|
||||||
def _set_vocab_gpt2(self) -> None:
|
def _set_vocab_gpt2(self) -> None:
|
||||||
tokens, toktypes, tokpre = self.get_vocab_base()
|
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||||
@ -540,7 +532,7 @@ class Model:
|
|||||||
|
|
||||||
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
|
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
|
||||||
added_vocab = tokenizer.special_tokens
|
added_vocab = tokenizer.special_tokens
|
||||||
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
|
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
|
||||||
|
|
||||||
for i in range(vocab_size):
|
for i in range(vocab_size):
|
||||||
if i not in reverse_vocab:
|
if i not in reverse_vocab:
|
||||||
@ -585,6 +577,10 @@ class Model:
|
|||||||
|
|
||||||
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
||||||
|
|
||||||
|
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
||||||
|
scores: list[float] = [-10000.0] * vocab_size
|
||||||
|
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
|
||||||
|
|
||||||
for token_id in range(tokenizer.vocab_size()):
|
for token_id in range(tokenizer.vocab_size()):
|
||||||
piece = tokenizer.IdToPiece(token_id)
|
piece = tokenizer.IdToPiece(token_id)
|
||||||
text = piece.encode("utf-8")
|
text = piece.encode("utf-8")
|
||||||
@ -600,21 +596,23 @@ class Model:
|
|||||||
elif tokenizer.IsByte(token_id):
|
elif tokenizer.IsByte(token_id):
|
||||||
toktype = SentencePieceTokenTypes.BYTE
|
toktype = SentencePieceTokenTypes.BYTE
|
||||||
|
|
||||||
tokens.append(text)
|
tokens[token_id] = text
|
||||||
scores.append(score)
|
scores[token_id] = score
|
||||||
toktypes.append(toktype)
|
toktypes[token_id] = toktype
|
||||||
|
|
||||||
added_tokens_file = self.dir_model / 'added_tokens.json'
|
added_tokens_file = self.dir_model / 'added_tokens.json'
|
||||||
if added_tokens_file.is_file():
|
if added_tokens_file.is_file():
|
||||||
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
||||||
added_tokens_json = json.load(f)
|
added_tokens_json = json.load(f)
|
||||||
|
|
||||||
for key in added_tokens_json:
|
for key in added_tokens_json:
|
||||||
key = key.encode("utf-8")
|
token_id = added_tokens_json[key]
|
||||||
if key not in tokens:
|
if (token_id >= vocab_size):
|
||||||
tokens.append(key)
|
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||||
scores.append(-1000.0)
|
continue
|
||||||
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
|
|
||||||
|
tokens[token_id] = key.encode("utf-8")
|
||||||
|
scores[token_id] = -1000.0
|
||||||
|
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||||
|
|
||||||
if vocab_size > len(tokens):
|
if vocab_size > len(tokens):
|
||||||
pad_count = vocab_size - len(tokens)
|
pad_count = vocab_size - len(tokens)
|
||||||
@ -624,8 +622,6 @@ class Model:
|
|||||||
scores.append(-1000.0)
|
scores.append(-1000.0)
|
||||||
toktypes.append(SentencePieceTokenTypes.UNUSED)
|
toktypes.append(SentencePieceTokenTypes.UNUSED)
|
||||||
|
|
||||||
assert len(tokens) == vocab_size
|
|
||||||
|
|
||||||
self.gguf_writer.add_tokenizer_model("llama")
|
self.gguf_writer.add_tokenizer_model("llama")
|
||||||
self.gguf_writer.add_tokenizer_pre("default")
|
self.gguf_writer.add_tokenizer_pre("default")
|
||||||
self.gguf_writer.add_token_list(tokens)
|
self.gguf_writer.add_token_list(tokens)
|
||||||
@ -677,6 +673,44 @@ class GPTNeoXModel(Model):
|
|||||||
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
|
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
|
||||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
||||||
|
|
||||||
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
del bid # unused
|
||||||
|
|
||||||
|
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
||||||
|
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
||||||
|
|
||||||
|
tensors: list[tuple[str, Tensor]] = []
|
||||||
|
|
||||||
|
if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
|
||||||
|
# Map bloom-style qkv_linear to gpt-style qkv_linear
|
||||||
|
# bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
|
||||||
|
# gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
|
||||||
|
qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
|
||||||
|
data_torch = torch.cat(
|
||||||
|
(
|
||||||
|
qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
|
||||||
|
qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
|
||||||
|
qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
|
||||||
|
),
|
||||||
|
dim=0,
|
||||||
|
)
|
||||||
|
logger.info("re-format attention.linear_qkv.weight")
|
||||||
|
elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name):
|
||||||
|
qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
|
||||||
|
data_torch = torch.cat(
|
||||||
|
(
|
||||||
|
qkv_bias[:, 0, :].reshape((n_embed,)),
|
||||||
|
qkv_bias[:, 1, :].reshape((n_embed,)),
|
||||||
|
qkv_bias[:, 2, :].reshape((n_embed,)),
|
||||||
|
),
|
||||||
|
dim=0,
|
||||||
|
)
|
||||||
|
logger.info("re-format attention.linear_qkv.bias")
|
||||||
|
|
||||||
|
tensors.append((self.map_tensor_name(name), data_torch))
|
||||||
|
|
||||||
|
return tensors
|
||||||
|
|
||||||
|
|
||||||
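The modify_tensors hunk above converts the fused query_key_value tensor from the per-head interleaved [q, k, v] layout used by GPT-NeoX/BLOOM-style checkpoints into the contiguous Q-then-K-then-V row layout expected on the llama.cpp side. A toy-sized illustration of the same reshape; the sizes here are made up:

import torch

n_head, head_dim = 2, 3
n_embed = n_head * head_dim

# fused qkv weight, shape (3 * n_embed, n_embed)
w = torch.arange(3 * n_embed * n_embed, dtype=torch.float32).reshape(3 * n_embed, n_embed)

qkv = w.reshape(n_head, 3, n_embed // n_head, n_embed)
w_reordered = torch.cat(
    (
        qkv[:, 0, :, :].reshape(-1, n_embed),  # all query rows
        qkv[:, 1, :, :].reshape(-1, n_embed),  # all key rows
        qkv[:, 2, :, :].reshape(-1, n_embed),  # all value rows
    ),
    dim=0,
)
assert w_reordered.shape == (3 * n_embed, n_embed)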
@Model.register("BloomForCausalLM")
|
@Model.register("BloomForCausalLM")
|
||||||
class BloomModel(Model):
|
class BloomModel(Model):
|
||||||
@ -856,6 +890,7 @@ class BaichuanModel(Model):
|
|||||||
self.gguf_writer.add_head_count(head_count)
|
self.gguf_writer.add_head_count(head_count)
|
||||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
self.gguf_writer.add_head_count_kv(head_count_kv)
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||||
@ -972,6 +1007,7 @@ class XverseModel(Model):
|
|||||||
self.gguf_writer.add_head_count(head_count)
|
self.gguf_writer.add_head_count(head_count)
|
||||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
self.gguf_writer.add_head_count_kv(head_count_kv)
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||||
@ -1139,45 +1175,6 @@ class RefactModel(Model):
|
|||||||
yield self.map_tensor_name(name), data_torch
|
yield self.map_tensor_name(name), data_torch
|
||||||
|
|
||||||
|
|
||||||
@Model.register("PersimmonForCausalLM")
|
|
||||||
class PersimmonModel(Model):
|
|
||||||
model_arch = gguf.MODEL_ARCH.PERSIMMON
|
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
|
||||||
block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
|
|
||||||
head_count = self.hparams["num_attention_heads"]
|
|
||||||
head_count_kv = head_count
|
|
||||||
hidden_size = self.hparams["hidden_size"]
|
|
||||||
|
|
||||||
self.gguf_writer.add_name('persimmon-8b-chat')
|
|
||||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
|
||||||
self.gguf_writer.add_embedding_length(hidden_size)
|
|
||||||
self.gguf_writer.add_block_count(block_count)
|
|
||||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
|
||||||
|
|
||||||
# NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
|
|
||||||
# than the head size?
|
|
||||||
# ref: https://github.com/ggerganov/llama.cpp/pull/4889
|
|
||||||
# self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
|
|
||||||
self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
|
|
||||||
|
|
||||||
self.gguf_writer.add_head_count(head_count)
|
|
||||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
|
||||||
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
|
|
||||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
|
||||||
|
|
||||||
def set_vocab(self):
|
|
||||||
self._set_vocab_sentencepiece()
|
|
||||||
# self.gguf_writer.add_bos_token_id(71013)
|
|
||||||
# self.gguf_writer.add_eos_token_id(71013)
|
|
||||||
|
|
||||||
def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
|
|
||||||
del name, new_name, bid, n_dims # unused
|
|
||||||
|
|
||||||
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
|
@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
|
||||||
class StableLMModel(Model):
|
class StableLMModel(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.STABLELM
|
model_arch = gguf.MODEL_ARCH.STABLELM
|
||||||
@ -1204,6 +1201,7 @@ class StableLMModel(Model):
|
|||||||
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
|
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
|
||||||
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
|
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
|
||||||
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
|
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
_q_norms: list[dict[str, Tensor]] | None = None
|
_q_norms: list[dict[str, Tensor]] | None = None
|
||||||
_k_norms: list[dict[str, Tensor]] | None = None
|
_k_norms: list[dict[str, Tensor]] | None = None
|
||||||
@ -1572,6 +1570,7 @@ class QwenModel(Model):
|
|||||||
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
|
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
|
|
||||||
@Model.register("Qwen2ForCausalLM")
|
@Model.register("Qwen2ForCausalLM")
|
||||||
@ -1753,6 +1752,38 @@ class Phi3MiniModel(Model):
|
|||||||
scores[token_id] = -1000.0
|
scores[token_id] = -1000.0
|
||||||
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||||
|
|
||||||
|
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||||
|
if tokenizer_config_file.is_file():
|
||||||
|
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||||
|
tokenizer_config_json = json.load(f)
|
||||||
|
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
|
||||||
|
for token_id, foken_data in added_tokens_decoder.items():
|
||||||
|
token_id = int(token_id)
|
||||||
|
token = foken_data["content"].encode("utf-8")
|
||||||
|
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
||||||
|
assert tokens[token_id] == token
|
||||||
|
tokens[token_id] = token
|
||||||
|
scores[token_id] = -1000.0
|
||||||
|
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||||
|
if foken_data.get("special"):
|
||||||
|
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
||||||
|
|
||||||
|
tokenizer_file = self.dir_model / 'tokenizer.json'
|
||||||
|
if tokenizer_file.is_file():
|
||||||
|
with open(tokenizer_file, "r", encoding="utf-8") as f:
|
||||||
|
tokenizer_json = json.load(f)
|
||||||
|
added_tokens = tokenizer_json.get("added_tokens", [])
|
||||||
|
for foken_data in added_tokens:
|
||||||
|
token_id = int(foken_data["id"])
|
||||||
|
token = foken_data["content"].encode("utf-8")
|
||||||
|
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
||||||
|
assert tokens[token_id] == token
|
||||||
|
tokens[token_id] = token
|
||||||
|
scores[token_id] = -1000.0
|
||||||
|
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||||
|
if foken_data.get("special"):
|
||||||
|
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
||||||
|
|
||||||
self.gguf_writer.add_tokenizer_model("llama")
|
self.gguf_writer.add_tokenizer_model("llama")
|
||||||
self.gguf_writer.add_tokenizer_pre("default")
|
self.gguf_writer.add_tokenizer_pre("default")
|
||||||
self.gguf_writer.add_token_list(tokens)
|
self.gguf_writer.add_token_list(tokens)
|
||||||
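The added set_vocab logic reads added or overridden tokens from two places, added_tokens_decoder in tokenizer_config.json and added_tokens in tokenizer.json, and marks them USER_DEFINED (or CONTROL when flagged "special"). For orientation, the relevant shape of those files looks roughly like the excerpt below; the ids and strings are illustrative, not taken from an actual Phi-3 release:

tokenizer_config_excerpt = {  # tokenizer_config.json
    "added_tokens_decoder": {
        "32000": {"content": "<|endoftext|>", "special": True},
        "32010": {"content": "<|user|>", "special": True},
    },
}

tokenizer_json_excerpt = {  # tokenizer.json
    "added_tokens": [
        {"id": 32000, "content": "<|endoftext|>", "special": True},
    ],
}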
@ -1765,23 +1796,59 @@ class Phi3MiniModel(Model):
|
|||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
|
block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
|
||||||
|
|
||||||
rot_pct = 1.0
|
|
||||||
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||||
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||||
|
n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
|
||||||
rms_eps = self.find_hparam(["rms_norm_eps"])
|
rms_eps = self.find_hparam(["rms_norm_eps"])
|
||||||
|
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
|
||||||
|
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
|
||||||
|
rope_dims = n_embd // n_head
|
||||||
|
|
||||||
self.gguf_writer.add_name("Phi3")
|
self.gguf_writer.add_name("Phi3")
|
||||||
self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
|
self.gguf_writer.add_context_length(max_pos_embds)
|
||||||
|
self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
|
||||||
self.gguf_writer.add_embedding_length(n_embd)
|
self.gguf_writer.add_embedding_length(n_embd)
|
||||||
self.gguf_writer.add_feed_forward_length(8192)
|
self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
|
||||||
self.gguf_writer.add_block_count(block_count)
|
self.gguf_writer.add_block_count(block_count)
|
||||||
self.gguf_writer.add_head_count(n_head)
|
self.gguf_writer.add_head_count(n_head)
|
||||||
self.gguf_writer.add_head_count_kv(n_head)
|
self.gguf_writer.add_head_count_kv(n_head_kv)
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
|
self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
|
||||||
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
|
self.gguf_writer.add_rope_dimension_count(rope_dims)
|
||||||
|
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
|
# write rope scaling for long context (128k) model
|
||||||
|
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
||||||
|
if (rope_scaling is None):
|
||||||
|
return
|
||||||
|
|
||||||
|
scale = max_pos_embds / orig_max_pos_embds
|
||||||
|
|
||||||
|
rope_scaling_type = rope_scaling.get('type', '').lower()
|
||||||
|
if len(rope_scaling_type) == 0:
|
||||||
|
raise KeyError('Missing the required key rope_scaling.type')
|
||||||
|
|
||||||
|
if rope_scaling_type == 'su':
|
||||||
|
attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
|
||||||
|
elif rope_scaling_type == 'yarn':
|
||||||
|
attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet')
|
||||||
|
|
||||||
|
self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
|
||||||
|
|
||||||
|
long_factors = rope_scaling.get('long_factor', None)
|
||||||
|
short_factors = rope_scaling.get('short_factor', None)
|
||||||
|
|
||||||
|
if long_factors is None or short_factors is None:
|
||||||
|
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
|
||||||
|
|
||||||
|
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
|
||||||
|
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
|
||||||
|
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
|
||||||
|
|
||||||
|
|
||||||
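For reference, the attention factor written by add_rope_scaling_attn_factors above follows directly from this hunk. With the notation (editorial, not from the source) s = max_position_embeddings / original_max_position_embeddings and n_orig = original_max_position_embeddings:

f_attn =
\begin{cases}
\sqrt{1 + \dfrac{\ln s}{\ln n_{\text{orig}}}} & \text{type "su"},\ s > 1 \\
0.1\,\ln s + 1 & \text{type "yarn"},\ s > 1 \\
1 & s \le 1
\end{cases}

The long_factor and short_factor arrays written right after must each contain rope_dims / 2 entries, where rope_dims = n_embd // n_head, which is exactly what the length check in the hunk enforces.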
@Model.register("PlamoForCausalLM")
|
@Model.register("PlamoForCausalLM")
|
||||||
class PlamoModel(Model):
|
class PlamoModel(Model):
|
||||||
@ -1802,6 +1869,7 @@ class PlamoModel(Model):
|
|||||||
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||||
self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong
|
self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
def shuffle_attn_q_weight(self, data_torch):
|
def shuffle_attn_q_weight(self, data_torch):
|
||||||
assert data_torch.size() == (5120, 5120)
|
assert data_torch.size() == (5120, 5120)
|
||||||
@ -1979,6 +2047,7 @@ in chat mode so that the conversation can end normally.")
|
|||||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
num_heads = self.hparams["num_attention_heads"]
|
num_heads = self.hparams["num_attention_heads"]
|
||||||
@@ -2367,6 +2436,157 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_eos_token(True)


+@Model.register("ArcticForCausalLM")
+class ArcticModel(Model):
+    model_arch = gguf.MODEL_ARCH.ARCTIC
+
+    def set_vocab(self):
+        # The reason for using a custom implementation here is that the
+        # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
+        # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
+        from sentencepiece import SentencePieceProcessor
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        if not tokenizer_path.is_file():
+            logger.error(f'Error: Missing {tokenizer_path}')
+            sys.exit(1)
+
+        # Read the whole vocabulary from the tokenizer.model file
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        # Use the added_tokens_decoder field from tokeniser_config.json as the source
+        # of information about added/redefined tokens and modify them accordingly.
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+
+                if "added_tokens_decoder" in tokenizer_config_json:
+                    added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
+                    for token_id, token_json in added_tokens_decoder.items():
+                        token_id = int(token_id)
+                        if (token_id >= vocab_size):
+                            logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                            continue
+
+                        token_content = token_json["content"]
+                        token_type = SentencePieceTokenTypes.USER_DEFINED
+                        token_score = -10000.0
+
+                        # Map unk_token to UNKNOWN, other special tokens to CONTROL
+                        # Set the score to 0.0 as in the original tokenizer.model
+                        if ("special" in token_json) and token_json["special"]:
+                            if token_content == tokenizer_config_json["unk_token"]:
+                                token_type = SentencePieceTokenTypes.UNKNOWN
+                            else:
+                                token_type = SentencePieceTokenTypes.CONTROL
+                                token_score = 0.0
+
+                        logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
+                        tokens[token_id] = token_content.encode("utf-8")
+                        toktypes[token_id] = token_type
+                        scores[token_id] = token_score
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith("q_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith("k_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]
+
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for wid in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def write_tensors(self):
+        super().write_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")


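To make the expert-merging step in `ArcticModel.modify_tensors` above concrete, here is a tiny sketch with toy shapes; the sizes and names are invented for illustration and are far smaller than Arctic's real tensors:

```python
# Toy illustration of merging per-expert weight matrices into one 3D tensor,
# mirroring the torch.stack(datas, dim=0) call in modify_tensors above.
import torch

n_experts, n_ff, n_embd = 4, 8, 16
w1_per_expert = [torch.randn(n_ff, n_embd) for _ in range(n_experts)]

merged = torch.stack(w1_per_expert, dim=0)
print(merged.shape)  # torch.Size([4, 8, 16]) -> (n_experts, n_ff, n_embd)
```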
 ###### CONVERSION LOGIC ######


@@ -2386,25 +2606,15 @@ class LazyTorchTensor(gguf.LazyBase):
     def numpy(self) -> gguf.LazyNumpyTensor:
         dtype = self._dtype_map[self.dtype]
         return gguf.LazyNumpyTensor(
-            meta=np.lib.stride_tricks.as_strided(np.zeros(1, dtype), self.shape, (0 for _ in self.shape)),
+            meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
             lazy=self._lazy,
             args=(self,),
             func=(lambda s: s[0].numpy())
         )

     @classmethod
-    def eager_to_meta(cls, t: Tensor) -> Tensor:
-        if t.is_meta:
-            return t
-        return t.detach().to("meta")
-
-    @classmethod
-    def meta_with_dtype(cls, m: Tensor, dtype: torch.dtype) -> Tensor:
-        m = m.detach()
-        if not m.is_meta:
-            m = m.to("meta")
-        m.dtype = dtype
-        return m
+    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
+        return torch.empty(size=shape, dtype=dtype, device="meta")

     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
@@ -2435,8 +2645,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -2494,6 +2704,7 @@ def main() -> None:
         "f32": gguf.LlamaFileType.ALL_F32,
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }

@ -1,143 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
from pprint import pprint
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from sentencepiece import SentencePieceProcessor
|
|
||||||
|
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
|
||||||
import gguf
|
|
||||||
|
|
||||||
logger = logging.getLogger("persimmon-to-gguf")
|
|
||||||
|
|
||||||
|
|
||||||
def _flatten_dict(dct, tensors, prefix=None):
|
|
||||||
assert isinstance(dct, dict)
|
|
||||||
for key in dct.keys():
|
|
||||||
new_prefix = prefix + '.' + key if prefix is not None else key
|
|
||||||
if isinstance(dct[key], torch.Tensor):
|
|
||||||
tensors[new_prefix] = dct[key]
|
|
||||||
elif isinstance(dct[key], dict):
|
|
||||||
_flatten_dict(dct[key], tensors, new_prefix)
|
|
||||||
else:
|
|
||||||
raise ValueError(type(dct[key]))
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _get_sentencepiece_tokenizer_info(dir_model: Path):
|
|
||||||
tokenizer_path = dir_model / 'adept_vocab.model'
|
|
||||||
logger.info('getting sentencepiece tokenizer from', tokenizer_path)
|
|
||||||
tokenizer = SentencePieceProcessor(str(tokenizer_path))
|
|
||||||
logger.info('adding tokens')
|
|
||||||
tokens: list[bytes] = []
|
|
||||||
scores: list[float] = []
|
|
||||||
toktypes: list[int] = []
|
|
||||||
|
|
||||||
for i in range(tokenizer.vocab_size()):
|
|
||||||
text: bytes
|
|
||||||
score: float
|
|
||||||
|
|
||||||
piece = tokenizer.id_to_piece(i)
|
|
||||||
text = piece.encode("utf-8")
|
|
||||||
score = tokenizer.get_score(i)
|
|
||||||
|
|
||||||
toktype = 1
|
|
||||||
if tokenizer.is_unknown(i):
|
|
||||||
toktype = 2
|
|
||||||
if tokenizer.is_control(i):
|
|
||||||
toktype = 3
|
|
||||||
if tokenizer.is_unused(i):
|
|
||||||
toktype = 5
|
|
||||||
if tokenizer.is_byte(i):
|
|
||||||
toktype = 6
|
|
||||||
|
|
||||||
tokens.append(text)
|
|
||||||
scores.append(score)
|
|
||||||
toktypes.append(toktype)
|
|
||||||
pass
|
|
||||||
return tokens, scores, toktypes
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
|
|
||||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
|
||||||
parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
|
|
||||||
parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
|
|
||||||
parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
|
|
||||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
|
||||||
args = parser.parse_args()
|
|
||||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
|
||||||
sys.path.append(str(args.adept_inference_dir))
|
|
||||||
persimmon_model = torch.load(args.ckpt_path)
|
|
||||||
hparams = persimmon_model['args']
|
|
||||||
pprint(hparams)
|
|
||||||
tensors: dict[str, torch.Tensor] = {}
|
|
||||||
_flatten_dict(persimmon_model['model'], tensors, None)
|
|
||||||
|
|
||||||
arch = gguf.MODEL_ARCH.PERSIMMON
|
|
||||||
gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
|
|
||||||
|
|
||||||
block_count = hparams.num_layers
|
|
||||||
head_count = hparams.num_attention_heads
|
|
||||||
head_count_kv = head_count
|
|
||||||
ctx_length = hparams.seq_length
|
|
||||||
hidden_size = hparams.hidden_size
|
|
||||||
|
|
||||||
gguf_writer.add_name('persimmon-8b-chat')
|
|
||||||
gguf_writer.add_context_length(ctx_length)
|
|
||||||
gguf_writer.add_embedding_length(hidden_size)
|
|
||||||
gguf_writer.add_block_count(block_count)
|
|
||||||
gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
|
|
||||||
# ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
|
|
||||||
gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
|
|
||||||
gguf_writer.add_head_count(head_count)
|
|
||||||
gguf_writer.add_head_count_kv(head_count_kv)
|
|
||||||
gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
|
|
||||||
gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
|
|
||||||
|
|
||||||
tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
|
|
||||||
gguf_writer.add_tokenizer_model('llama')
|
|
||||||
gguf_writer.add_tokenizer_pre('default')
|
|
||||||
gguf_writer.add_token_list(tokens)
|
|
||||||
gguf_writer.add_token_scores(scores)
|
|
||||||
gguf_writer.add_token_types(toktypes)
|
|
||||||
gguf_writer.add_bos_token_id(71013)
|
|
||||||
gguf_writer.add_eos_token_id(71013)
|
|
||||||
|
|
||||||
tensor_map = gguf.get_tensor_name_map(arch, block_count)
|
|
||||||
logger.info(tensor_map)
|
|
||||||
for name in tensors.keys():
|
|
||||||
data_torch = tensors[name]
|
|
||||||
if name.endswith(".self_attention.rotary_emb.inv_freq"):
|
|
||||||
continue
|
|
||||||
old_dtype = data_torch.dtype
|
|
||||||
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
|
|
||||||
data = data_torch.to(torch.float32).squeeze().numpy()
|
|
||||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
|
||||||
if new_name is None:
|
|
||||||
raise ValueError(f"Can not map tensor '{name}'")
|
|
||||||
|
|
||||||
n_dims = len(data.shape)
|
|
||||||
logger.debug(f"{new_name}, n_dims = {str(n_dims)}, {str(old_dtype)} --> {str(data.dtype)}")
|
|
||||||
gguf_writer.add_tensor(new_name, data)
|
|
||||||
logger.info("gguf: write header")
|
|
||||||
gguf_writer.write_header_to_file()
|
|
||||||
logger.info("gguf: write metadata")
|
|
||||||
gguf_writer.write_kv_data_to_file()
|
|
||||||
logger.info("gguf: write tensors")
|
|
||||||
gguf_writer.write_tensors_to_file()
|
|
||||||
|
|
||||||
gguf_writer.close()
|
|
||||||
|
|
||||||
logger.info(f"gguf: model successfully exported to '{args.outfile}'")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
178 convert.py
@@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional

 import numpy as np
 from sentencepiece import SentencePieceProcessor
@@ -344,10 +344,47 @@ class Params:
         return params


+@dataclass
+class Metadata:
+    name: Optional[str] = None
+    author: Optional[str] = None
+    version: Optional[str] = None
+    url: Optional[str] = None
+    description: Optional[str] = None
+    licence: Optional[str] = None
+    source_url: Optional[str] = None
+    source_hf_repo: Optional[str] = None
+
+    @staticmethod
+    def load(metadata_path: Path) -> Metadata:
+        if metadata_path is None or not metadata_path.exists():
+            return Metadata()
+
+        with open(metadata_path, 'r') as file:
+            data = json.load(file)
+
+        # Create a new Metadata instance
+        metadata = Metadata()
+
+        # Assigning values to Metadata attributes if they exist in the JSON file
+        # This is based on LLM_KV_NAMES mapping in llama.cpp
+        metadata.name = data.get("general.name")
+        metadata.author = data.get("general.author")
+        metadata.version = data.get("general.version")
+        metadata.url = data.get("general.url")
+        metadata.description = data.get("general.description")
+        metadata.license = data.get("general.license")
+        metadata.source_url = data.get("general.source.url")
+        metadata.source_hf_repo = data.get("general.source.huggingface.repository")
+
+        return metadata
+
+
 #
 # vocab
 #


 @runtime_checkable
 class BaseVocab(Protocol):
     tokenizer_model: ClassVar[str]
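For reference, a minimal sketch of how the new `--metadata` file could be written and consumed by `Metadata.load()` above; every value is a placeholder, and the snippet assumes it is run from the llama.cpp root so that `convert.py` is importable:

```python
# Hypothetical usage of the Metadata class added above; all values are placeholders.
import json
from pathlib import Path

from convert import Metadata  # assumes the working directory contains llama.cpp's convert.py

meta_path = Path("metadata.json")
meta_path.write_text(json.dumps({
    "general.name": "MyModel",
    "general.author": "Example Author",
    "general.version": "v1.0",
    "general.source.huggingface.repository": "example/mymodel",
}))

metadata = Metadata.load(meta_path)
print(metadata.name, metadata.version)  # MyModel v1.0
```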
@ -1066,21 +1103,42 @@ class OutputFile:
|
|||||||
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
|
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
|
||||||
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
||||||
|
|
||||||
def add_meta_arch(self, params: Params) -> None:
|
def add_meta_model(self, params: Params, metadata: Metadata) -> None:
|
||||||
|
# Metadata About The Model And Its Provenence
|
||||||
name = "LLaMA"
|
name = "LLaMA"
|
||||||
|
if metadata is not None and metadata.name is not None:
|
||||||
# TODO: better logic to determine model name
|
name = metadata.name
|
||||||
if params.n_ctx == 4096:
|
|
||||||
name = "LLaMA v2"
|
|
||||||
elif params.path_model is not None:
|
elif params.path_model is not None:
|
||||||
name = str(params.path_model.parent).split('/')[-1]
|
name = params.path_model.name
|
||||||
|
elif params.n_ctx == 4096:
|
||||||
|
# Heuristic detection of LLaMA v2 model
|
||||||
|
name = "LLaMA v2"
|
||||||
|
|
||||||
self.gguf.add_name (name)
|
self.gguf.add_name(name)
|
||||||
self.gguf.add_vocab_size (params.n_vocab)
|
|
||||||
self.gguf.add_context_length (params.n_ctx)
|
if metadata is not None:
|
||||||
self.gguf.add_embedding_length (params.n_embd)
|
if metadata.author is not None:
|
||||||
self.gguf.add_block_count (params.n_layer)
|
self.gguf.add_author(metadata.author)
|
||||||
self.gguf.add_feed_forward_length (params.n_ff)
|
if metadata.version is not None:
|
||||||
|
self.gguf.add_version(metadata.version)
|
||||||
|
if metadata.url is not None:
|
||||||
|
self.gguf.add_url(metadata.url)
|
||||||
|
if metadata.description is not None:
|
||||||
|
self.gguf.add_description(metadata.description)
|
||||||
|
if metadata.licence is not None:
|
||||||
|
self.gguf.add_licence(metadata.licence)
|
||||||
|
if metadata.source_url is not None:
|
||||||
|
self.gguf.add_source_url(metadata.source_url)
|
||||||
|
if metadata.source_hf_repo is not None:
|
||||||
|
self.gguf.add_source_hf_repo(metadata.source_hf_repo)
|
||||||
|
|
||||||
|
def add_meta_arch(self, params: Params) -> None:
|
||||||
|
# Metadata About The Neural Architecture Itself
|
||||||
|
self.gguf.add_vocab_size(params.n_vocab)
|
||||||
|
self.gguf.add_context_length(params.n_ctx)
|
||||||
|
self.gguf.add_embedding_length(params.n_embd)
|
||||||
|
self.gguf.add_block_count(params.n_layer)
|
||||||
|
self.gguf.add_feed_forward_length(params.n_ff)
|
||||||
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
|
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
|
||||||
self.gguf.add_head_count (params.n_head)
|
self.gguf.add_head_count (params.n_head)
|
||||||
self.gguf.add_head_count_kv (params.n_head_kv)
|
self.gguf.add_head_count_kv (params.n_head_kv)
|
||||||
@ -1183,13 +1241,14 @@ class OutputFile:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def write_vocab_only(
|
def write_vocab_only(
|
||||||
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
|
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
|
||||||
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
|
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
|
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
|
||||||
|
|
||||||
of = OutputFile(fname_out, endianess=endianess)
|
of = OutputFile(fname_out, endianess=endianess)
|
||||||
|
|
||||||
# meta data
|
# meta data
|
||||||
|
of.add_meta_model(params, metadata)
|
||||||
of.add_meta_arch(params)
|
of.add_meta_arch(params)
|
||||||
of.add_meta_vocab(vocab)
|
of.add_meta_vocab(vocab)
|
||||||
of.add_meta_special_vocab(svocab)
|
of.add_meta_special_vocab(svocab)
|
||||||
@ -1216,12 +1275,14 @@ class OutputFile:
|
|||||||
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
|
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
|
||||||
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
||||||
pad_vocab: bool = False,
|
pad_vocab: bool = False,
|
||||||
|
metadata: Metadata = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
|
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
|
||||||
|
|
||||||
of = OutputFile(fname_out, endianess=endianess)
|
of = OutputFile(fname_out, endianess=endianess)
|
||||||
|
|
||||||
# meta data
|
# meta data
|
||||||
|
of.add_meta_model(params, metadata)
|
||||||
of.add_meta_arch(params)
|
of.add_meta_arch(params)
|
||||||
if isinstance(vocab, Vocab):
|
if isinstance(vocab, Vocab):
|
||||||
of.add_meta_vocab(vocab)
|
of.add_meta_vocab(vocab)
|
||||||
@@ -1257,6 +1318,37 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
     raise ValueError(f"Unexpected combination of types: {name_to_type}")


+def model_parameter_count(model: LazyModel) -> int:
+    total_model_parameters = 0
+    for i, (name, lazy_tensor) in enumerate(model.items()):
+        sum_weights_in_tensor = 1
+        for dim in lazy_tensor.shape:
+            sum_weights_in_tensor *= dim
+        total_model_parameters += sum_weights_in_tensor
+    return total_model_parameters
+
+
+def model_parameter_count_rounded_notation(model_params_count: int) -> str:
+    if model_params_count > 1e12 :
+        # Trillions Of Parameters
+        scaled_model_params = model_params_count * 1e-12
+        scale_suffix = "T"
+    elif model_params_count > 1e9 :
+        # Billions Of Parameters
+        scaled_model_params = model_params_count * 1e-9
+        scale_suffix = "B"
+    elif model_params_count > 1e6 :
+        # Millions Of Parameters
+        scaled_model_params = model_params_count * 1e-6
+        scale_suffix = "M"
+    else:
+        # Thousands Of Parameters
+        scaled_model_params = model_params_count * 1e-3
+        scale_suffix = "K"
+
+    return f"{round(scaled_model_params)}{scale_suffix}"
+
+
 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
     return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
             for (name, tensor) in model.items()}
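A couple of hypothetical calls show what the rounded notation above produces, assuming the function is in scope; the parameter counts are made up for illustration:

```python
# Illustrative only; these parameter counts are invented.
print(model_parameter_count_rounded_notation(46_700_000_000))  # -> 47B
print(model_parameter_count_rounded_notation(7_241_000_000))   # -> 7B
print(model_parameter_count_rounded_notation(125_000_000))     # -> 125M
```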
@@ -1436,13 +1528,35 @@ class VocabFactory:
         return vocab, special_vocab


-def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
-    namestr = {
-        GGMLFileType.AllF32: "f32",
-        GGMLFileType.MostlyF16: "f16",
-        GGMLFileType.MostlyQ8_0:"q8_0",
+def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
+    quantization = {
+        GGMLFileType.AllF32: "F32",
+        GGMLFileType.MostlyF16: "F16",
+        GGMLFileType.MostlyQ8_0: "Q8_0",
     }[file_type]
-    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
+
+    parameters = model_parameter_count_rounded_notation(model_params_count)
+
+    expert_count = ""
+    if params.n_experts is not None:
+        expert_count = f"{params.n_experts}x"
+
+    version = ""
+    if metadata is not None and metadata.version is not None:
+        version = f"-{metadata.version}"
+
+    name = "ggml-model"
+    if metadata is not None and metadata.name is not None:
+        name = metadata.name
+    elif params.path_model is not None:
+        name = params.path_model.name
+
+    return f"{name}{version}-{expert_count}{parameters}-{quantization}"
+
+
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
+    default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
+    ret = model_paths[0].parent / f"{default_filename}.gguf"
     if ret in model_paths:
         logger.error(
             f"Error: Default output path ({ret}) would overwrite the input. "
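Putting the naming pieces together, a purely illustrative rendering of the new default filename convention; the model name, expert count, and sizes below are hypothetical and no real conversion is involved:

```python
# Hypothetical values plugged into the convention used by default_convention_outfile() above.
name, version, expert_count, parameters, quantization = "Mixtral-8x7B-v0.1", "", "8x", "47B", "F16"
print(f"{name}{version}-{expert_count}{parameters}-{quantization}.gguf")
# -> Mixtral-8x7B-v0.1-8x47B-F16.gguf
```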
@ -1480,17 +1594,30 @@ def main(args_in: list[str] | None = None) -> None:
|
|||||||
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
|
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
|
||||||
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
|
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
|
||||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||||
|
parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
|
||||||
|
parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
|
||||||
|
|
||||||
args = parser.parse_args(args_in)
|
args = parser.parse_args(args_in)
|
||||||
|
|
||||||
if args.verbose:
|
if args.verbose:
|
||||||
logging.basicConfig(level=logging.DEBUG)
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
elif args.dump_single or args.dump:
|
elif args.dump_single or args.dump or args.get_outfile:
|
||||||
# Avoid printing anything besides the dump output
|
# Avoid printing anything besides the dump output
|
||||||
logging.basicConfig(level=logging.WARNING)
|
logging.basicConfig(level=logging.WARNING)
|
||||||
else:
|
else:
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
metadata = Metadata.load(args.metadata)
|
||||||
|
|
||||||
|
if args.get_outfile:
|
||||||
|
model_plus = load_some_model(args.model)
|
||||||
|
params = Params.load(model_plus)
|
||||||
|
model = convert_model_names(model_plus.model, params, args.skip_unknown)
|
||||||
|
model_params_count = model_parameter_count(model_plus.model)
|
||||||
|
ftype = pick_output_type(model, args.outtype)
|
||||||
|
print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
|
||||||
|
return
|
||||||
|
|
||||||
if args.no_vocab and args.vocab_only:
|
if args.no_vocab and args.vocab_only:
|
||||||
raise ValueError("--vocab-only does not make sense with --no-vocab")
|
raise ValueError("--vocab-only does not make sense with --no-vocab")
|
||||||
|
|
||||||
@ -1504,6 +1631,9 @@ def main(args_in: list[str] | None = None) -> None:
|
|||||||
else:
|
else:
|
||||||
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
|
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
|
||||||
|
|
||||||
|
model_params_count = model_parameter_count(model_plus.model)
|
||||||
|
logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
|
||||||
|
|
||||||
if args.dump:
|
if args.dump:
|
||||||
do_dump_model(model_plus)
|
do_dump_model(model_plus)
|
||||||
return
|
return
|
||||||
@ -1557,7 +1687,7 @@ def main(args_in: list[str] | None = None) -> None:
|
|||||||
f_norm_eps = 1e-5,
|
f_norm_eps = 1e-5,
|
||||||
)
|
)
|
||||||
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
|
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
|
||||||
endianess=endianess, pad_vocab=args.pad_vocab)
|
endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
|
||||||
logger.info(f"Wrote {outfile}")
|
logger.info(f"Wrote {outfile}")
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -1570,13 +1700,13 @@ def main(args_in: list[str] | None = None) -> None:
|
|||||||
model = convert_model_names(model, params, args.skip_unknown)
|
model = convert_model_names(model, params, args.skip_unknown)
|
||||||
ftype = pick_output_type(model, args.outtype)
|
ftype = pick_output_type(model, args.outtype)
|
||||||
model = convert_to_output_type(model, ftype)
|
model = convert_to_output_type(model, ftype)
|
||||||
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
|
outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
|
||||||
|
|
||||||
params.ftype = ftype
|
params.ftype = ftype
|
||||||
logger.info(f"Writing {outfile}, format {ftype}")
|
logger.info(f"Writing {outfile}, format {ftype}")
|
||||||
|
|
||||||
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
|
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
|
||||||
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
|
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
|
||||||
logger.info(f"Wrote {outfile}")
|
logger.info(f"Wrote {outfile}")
|
||||||
|
|
||||||
|
|
||||||
|
@@ -1,6 +1,6 @@
 # Debugging Tests Tips

-## How to run & debug a specific test without anything else to keep the feedback loop short?
+## How to run & execute or debug a specific test without anything else to keep the feedback loop short?

 There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number.

@@ -10,13 +10,27 @@ For example, running the following command will output an interactive list from

 It will then build & run in the debugger for you.

+To just execute a test and get back a PASS or FAIL message run:
+
 ```bash
 ./scripts/debug-test.sh test-tokenizer
+```
+
+To test in GDB use the `-g` flag to enable gdb test mode.
+
+```bash
+./scripts/debug-test.sh -g test-tokenizer

 # Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
 >>> b main
 ```

+To speed up the testing loop, if you know your test number you can just run it similar to below:
+
+```bash
+./scripts/debug-test.sh test 23
+```
+
 For further reference use `debug-test.sh -h` to print help.

@@ -41,7 +55,7 @@ cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
 make -j
 ```

-#### Step 3.1: Identify Test Command for Debugging
+#### Step 3: Find all tests available that matches REGEX

 The output of this command will give you the command & arguments needed to run GDB.

@@ -69,11 +83,13 @@ Labels: main
 ...
 ```

-So for test #1 we can tell these two pieces of relevant information:
+#### Step 4: Identify Test Command for Debugging
+
+So for test #1 above we can tell these two pieces of relevant information:
 * Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
 * Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`

-#### Step 3.2: Run GDB on test command
+#### Step 5: Run GDB on test command

 Based on the ctest 'test command' report above we can then run a gdb session via this command below:

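For example, a session for test #1 above might look like this; the paths will differ on your machine, and the command is a sketch rather than the script's verbatim output:

```bash
gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 ~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf
```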
@@ -49,4 +49,7 @@ else()
         add_subdirectory(server)
     endif()
     add_subdirectory(export-lora)
+    if (LLAMA_RPC)
+        add_subdirectory(rpc)
+    endif()
 endif()
@ -48,7 +48,7 @@ int main(int argc, char ** argv) {
|
|||||||
params.prompt = "Hello my name is";
|
params.prompt = "Hello my name is";
|
||||||
}
|
}
|
||||||
|
|
||||||
process_escapes(params.prompt);
|
string_process_escapes(params.prompt);
|
||||||
|
|
||||||
// init LLM
|
// init LLM
|
||||||
|
|
||||||
|
@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {
|
|||||||
|
|
||||||
params.samples_start_after_nl = false;
|
params.samples_start_after_nl = false;
|
||||||
params.use_adam = true;
|
params.use_adam = true;
|
||||||
params.use_flash = true;
|
params.use_flash = false;
|
||||||
params.use_scratch = true;
|
params.use_scratch = true;
|
||||||
|
|
||||||
// only adam
|
// only adam
|
||||||
|
@ -80,7 +80,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
std::mt19937 rng(params.seed);
|
||||||
if (params.random_prompt) {
|
if (params.random_prompt) {
|
||||||
params.prompt = gpt_random_prompt(rng);
|
params.prompt = string_random_prompt(rng);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
@ -107,7 +107,7 @@ int main(int argc, char ** argv) {
|
|||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// split the prompt into lines
|
// split the prompt into lines
|
||||||
@ -211,6 +211,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// clean up
|
// clean up
|
||||||
llama_print_timings(ctx);
|
llama_print_timings(ctx);
|
||||||
|
llama_batch_free(batch);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
@ -152,7 +152,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
std::mt19937 rng(params.seed);
|
||||||
if (params.random_prompt) {
|
if (params.random_prompt) {
|
||||||
params.prompt = gpt_random_prompt(rng);
|
params.prompt = string_random_prompt(rng);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
|
|||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool OK = run(ctx, params);
|
bool OK = run(ctx, params);
|
||||||
|
@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
|||||||
// not capturing these, to silcence warnings
|
// not capturing these, to silcence warnings
|
||||||
const int rope_mode = 0;
|
const int rope_mode = 0;
|
||||||
|
|
||||||
return ggml_rope_custom(ctx,
|
return ggml_rope_ext(ctx,
|
||||||
t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
|
t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
|
||||||
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
|
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
@ -643,7 +643,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
|||||||
struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
|
struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
|
||||||
struct ggml_tensor * t16;
|
struct ggml_tensor * t16;
|
||||||
if (enable_flash_attn) {
|
if (enable_flash_attn) {
|
||||||
t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
|
GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
|
||||||
|
//t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
|
||||||
} else {
|
} else {
|
||||||
struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
|
struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
|
||||||
struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
|
struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
|
||||||
|
@ -598,7 +598,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
std::mt19937 rng(params.seed);
|
||||||
if (params.random_prompt) {
|
if (params.random_prompt) {
|
||||||
params.prompt = gpt_random_prompt(rng);
|
params.prompt = string_random_prompt(rng);
|
||||||
}
|
}
|
||||||
|
|
||||||
sparams.dataset = params.prompt_file;
|
sparams.dataset = params.prompt_file;
|
||||||
@ -667,7 +667,7 @@ int main(int argc, char ** argv) {
|
|||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
|
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
|
||||||
|
@ -50,9 +50,9 @@ static void write_logfile(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::string timestamp = get_sortable_timestamp();
|
const std::string timestamp = string_get_sortable_timestamp();
|
||||||
|
|
||||||
const bool success = create_directory_with_parents(params.logdir);
|
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||||
if (!success) {
|
if (!success) {
|
||||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||||
__func__, params.logdir.c_str());
|
__func__, params.logdir.c_str());
|
||||||
@ -70,7 +70,7 @@ static void write_logfile(
|
|||||||
fprintf(logfile, "binary: infill\n");
|
fprintf(logfile, "binary: infill\n");
|
||||||
char model_desc[128];
|
char model_desc[128];
|
||||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||||
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
||||||
|
|
||||||
fprintf(logfile, "\n");
|
fprintf(logfile, "\n");
|
||||||
fprintf(logfile, "######################\n");
|
fprintf(logfile, "######################\n");
|
||||||
@ -78,8 +78,8 @@ static void write_logfile(
|
|||||||
fprintf(logfile, "######################\n");
|
fprintf(logfile, "######################\n");
|
||||||
fprintf(logfile, "\n");
|
fprintf(logfile, "\n");
|
||||||
|
|
||||||
dump_string_yaml_multiline(logfile, "output", output.c_str());
|
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
||||||
dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
|
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
||||||
|
|
||||||
llama_dump_timing_info_yaml(logfile, ctx);
|
llama_dump_timing_info_yaml(logfile, ctx);
|
||||||
fclose(logfile);
|
fclose(logfile);
|
||||||
@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
|
|||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s\n", get_system_info(params).c_str());
|
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
const bool add_bos = llama_should_add_bos_token(model);
|
||||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||||
@ -621,8 +621,8 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
if (params.escape) {
|
if (params.escape) {
|
||||||
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
|
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
|
||||||
process_escapes(params.input_prefix);
|
string_process_escapes(params.input_prefix);
|
||||||
process_escapes(params.input_suffix);
|
string_process_escapes(params.input_suffix);
|
||||||
}
|
}
|
||||||
suff_rm_leading_spc = params.escape;
|
suff_rm_leading_spc = params.escape;
|
||||||
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
|
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
|
||||||
|
@ -195,12 +195,12 @@ static const cmd_params cmd_params_defaults = {
|
|||||||
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
|
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
|
||||||
/* n_prompt */ {512},
|
/* n_prompt */ {512},
|
||||||
/* n_gen */ {128},
|
/* n_gen */ {128},
|
||||||
/* n_pg */ {{512, 128}},
|
/* n_pg */ {},
|
||||||
/* n_batch */ {2048},
|
/* n_batch */ {2048},
|
||||||
/* n_ubatch */ {512},
|
/* n_ubatch */ {512},
|
||||||
/* type_k */ {GGML_TYPE_F16},
|
/* type_k */ {GGML_TYPE_F16},
|
||||||
/* type_v */ {GGML_TYPE_F16},
|
/* type_v */ {GGML_TYPE_F16},
|
||||||
/* n_threads */ {get_math_cpu_count()},
|
/* n_threads */ {cpu_get_num_math()},
|
||||||
/* n_gpu_layers */ {99},
|
/* n_gpu_layers */ {99},
|
||||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||||
/* main_gpu */ {0},
|
/* main_gpu */ {0},
|
||||||
|
@ -7,8 +7,6 @@ android {
|
|||||||
namespace = "com.example.llama"
|
namespace = "com.example.llama"
|
||||||
compileSdk = 34
|
compileSdk = 34
|
||||||
|
|
||||||
ndkVersion = "26.1.10909125"
|
|
||||||
|
|
||||||
defaultConfig {
|
defaultConfig {
|
||||||
applicationId = "com.example.llama"
|
applicationId = "com.example.llama"
|
||||||
minSdk = 33
|
minSdk = 33
|
||||||
@ -20,17 +18,6 @@ android {
|
|||||||
vectorDrawables {
|
vectorDrawables {
|
||||||
useSupportLibrary = true
|
useSupportLibrary = true
|
||||||
}
|
}
|
||||||
ndk {
|
|
||||||
// Add NDK properties if wanted, e.g.
|
|
||||||
// abiFilters += listOf("arm64-v8a")
|
|
||||||
}
|
|
||||||
externalNativeBuild {
|
|
||||||
cmake {
|
|
||||||
arguments += "-DCMAKE_BUILD_TYPE=Release"
|
|
||||||
cppFlags += listOf()
|
|
||||||
arguments += listOf()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
buildTypes {
|
buildTypes {
|
||||||
@ -55,17 +42,6 @@ android {
|
|||||||
composeOptions {
|
composeOptions {
|
||||||
kotlinCompilerExtensionVersion = "1.5.1"
|
kotlinCompilerExtensionVersion = "1.5.1"
|
||||||
}
|
}
|
||||||
packaging {
|
|
||||||
resources {
|
|
||||||
excludes += "/META-INF/{AL2.0,LGPL2.1}"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
externalNativeBuild {
|
|
||||||
cmake {
|
|
||||||
path = file("src/main/cpp/CMakeLists.txt")
|
|
||||||
version = "3.22.1"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
@ -78,6 +54,7 @@ dependencies {
|
|||||||
implementation("androidx.compose.ui:ui-graphics")
|
implementation("androidx.compose.ui:ui-graphics")
|
||||||
implementation("androidx.compose.ui:ui-tooling-preview")
|
implementation("androidx.compose.ui:ui-tooling-preview")
|
||||||
implementation("androidx.compose.material3:material3")
|
implementation("androidx.compose.material3:material3")
|
||||||
|
implementation(project(":llama"))
|
||||||
testImplementation("junit:junit:4.13.2")
|
testImplementation("junit:junit:4.13.2")
|
||||||
androidTestImplementation("androidx.test.ext:junit:1.1.5")
|
androidTestImplementation("androidx.test.ext:junit:1.1.5")
|
||||||
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
|
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
package com.example.llama
|
package com.example.llama
|
||||||
|
|
||||||
|
import android.llama.cpp.LLamaAndroid
|
||||||
import android.util.Log
|
import android.util.Log
|
||||||
import androidx.compose.runtime.getValue
|
import androidx.compose.runtime.getValue
|
||||||
import androidx.compose.runtime.mutableStateOf
|
import androidx.compose.runtime.mutableStateOf
|
||||||
@ -9,7 +10,7 @@ import androidx.lifecycle.viewModelScope
|
|||||||
import kotlinx.coroutines.flow.catch
|
import kotlinx.coroutines.flow.catch
|
||||||
import kotlinx.coroutines.launch
|
import kotlinx.coroutines.launch
|
||||||
|
|
||||||
class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
|
||||||
companion object {
|
companion object {
|
||||||
@JvmStatic
|
@JvmStatic
|
||||||
private val NanosPerSecond = 1_000_000_000.0
|
private val NanosPerSecond = 1_000_000_000.0
|
||||||
@ -28,7 +29,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
|||||||
|
|
||||||
viewModelScope.launch {
|
viewModelScope.launch {
|
||||||
try {
|
try {
|
||||||
llm.unload()
|
llamaAndroid.unload()
|
||||||
} catch (exc: IllegalStateException) {
|
} catch (exc: IllegalStateException) {
|
||||||
messages += exc.message!!
|
messages += exc.message!!
|
||||||
}
|
}
|
||||||
@ -44,7 +45,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
|||||||
messages += ""
|
messages += ""
|
||||||
|
|
||||||
viewModelScope.launch {
|
viewModelScope.launch {
|
||||||
llm.send(text)
|
llamaAndroid.send(text)
|
||||||
.catch {
|
.catch {
|
||||||
Log.e(tag, "send() failed", it)
|
Log.e(tag, "send() failed", it)
|
||||||
messages += it.message!!
|
messages += it.message!!
|
||||||
@ -57,7 +58,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
|||||||
viewModelScope.launch {
|
viewModelScope.launch {
|
||||||
try {
|
try {
|
||||||
val start = System.nanoTime()
|
val start = System.nanoTime()
|
||||||
val warmupResult = llm.bench(pp, tg, pl, nr)
|
val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
|
||||||
val end = System.nanoTime()
|
val end = System.nanoTime()
|
||||||
|
|
||||||
messages += warmupResult
|
messages += warmupResult
|
||||||
@ -70,7 +71,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
|||||||
return@launch
|
return@launch
|
||||||
}
|
}
|
||||||
|
|
||||||
messages += llm.bench(512, 128, 1, 3)
|
messages += llamaAndroid.bench(512, 128, 1, 3)
|
||||||
} catch (exc: IllegalStateException) {
|
} catch (exc: IllegalStateException) {
|
||||||
Log.e(tag, "bench() failed", exc)
|
Log.e(tag, "bench() failed", exc)
|
||||||
messages += exc.message!!
|
messages += exc.message!!
|
||||||
@ -81,7 +82,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
|||||||
fun load(pathToModel: String) {
|
fun load(pathToModel: String) {
|
||||||
viewModelScope.launch {
|
viewModelScope.launch {
|
||||||
try {
|
try {
|
||||||
llm.load(pathToModel)
|
llamaAndroid.load(pathToModel)
|
||||||
messages += "Loaded $pathToModel"
|
messages += "Loaded $pathToModel"
|
||||||
} catch (exc: IllegalStateException) {
|
} catch (exc: IllegalStateException) {
|
||||||
Log.e(tag, "load() failed", exc)
|
Log.e(tag, "load() failed", exc)
|
||||||
|
@ -2,4 +2,5 @@
|
|||||||
plugins {
|
plugins {
|
||||||
id("com.android.application") version "8.2.0" apply false
|
id("com.android.application") version "8.2.0" apply false
|
||||||
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
|
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
|
||||||
|
id("com.android.library") version "8.2.0" apply false
|
||||||
}
|
}
|
||||||
|
1
examples/llama.android/llama/.gitignore
vendored
Normal file
1
examples/llama.android/llama/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
/build
|
55
examples/llama.android/llama/CMakeLists.txt
Normal file
55
examples/llama.android/llama/CMakeLists.txt
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
|
||||||
|
# For more information about using CMake with Android Studio, read the
|
||||||
|
# documentation: https://d.android.com/studio/projects/add-native-code.html.
|
||||||
|
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
|
||||||
|
|
||||||
|
# Sets the minimum CMake version required for this project.
|
||||||
|
cmake_minimum_required(VERSION 3.22.1)
|
||||||
|
|
||||||
|
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
|
||||||
|
# Since this is the top level CMakeLists.txt, the project name is also accessible
|
||||||
|
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
|
||||||
|
# build script scope).
|
||||||
|
project("llama-android")
|
||||||
|
|
||||||
|
## Fetch latest llama.cpp from GitHub
|
||||||
|
#include(FetchContent)
|
||||||
|
#FetchContent_Declare(
|
||||||
|
# llama
|
||||||
|
# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
|
||||||
|
# GIT_TAG master
|
||||||
|
#)
|
||||||
|
#
|
||||||
|
## Also provides "common"
|
||||||
|
#FetchContent_MakeAvailable(llama)
|
||||||
|
|
||||||
|
# llama.cpp CI uses the code from the current branch
|
||||||
|
# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
|
||||||
|
add_subdirectory(../../../../../../ build-llama)
|
||||||
|
|
||||||
|
# Creates and names a library, sets it as either STATIC
|
||||||
|
# or SHARED, and provides the relative paths to its source code.
|
||||||
|
# You can define multiple libraries, and CMake builds them for you.
|
||||||
|
# Gradle automatically packages shared libraries with your APK.
|
||||||
|
#
|
||||||
|
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
|
||||||
|
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
|
||||||
|
# is preferred for the same purpose.
|
||||||
|
#
|
||||||
|
# In order to load a library into your app from Java/Kotlin, you must call
|
||||||
|
# System.loadLibrary() and pass the name of the library defined here;
|
||||||
|
# for GameActivity/NativeActivity derived applications, the same library name must be
|
||||||
|
# used in the AndroidManifest.xml file.
|
||||||
|
add_library(${CMAKE_PROJECT_NAME} SHARED
|
||||||
|
# List C/C++ source files with relative paths to this CMakeLists.txt.
|
||||||
|
llama-android.cpp)
|
||||||
|
|
||||||
|
# Specifies libraries CMake should link to your target library. You
|
||||||
|
# can link libraries from various origins, such as libraries defined in this
|
||||||
|
# build script, prebuilt third-party libraries, or Android system libraries.
|
||||||
|
target_link_libraries(${CMAKE_PROJECT_NAME}
|
||||||
|
# List libraries link to the target library
|
||||||
|
llama
|
||||||
|
common
|
||||||
|
android
|
||||||
|
log)
|
68 examples/llama.android/llama/build.gradle.kts Normal file
@@ -0,0 +1,68 @@
plugins {
    id("com.android.library")
    id("org.jetbrains.kotlin.android")
}

android {
    namespace = "android.llama.cpp"
    compileSdk = 34

    defaultConfig {
        minSdk = 33

        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
        consumerProguardFiles("consumer-rules.pro")
        ndk {
            // Add NDK properties if wanted, e.g.
            // abiFilters += listOf("arm64-v8a")
        }
        externalNativeBuild {
            cmake {
                arguments += "-DCMAKE_BUILD_TYPE=Release"
                cppFlags += listOf()
                arguments += listOf()

                cppFlags("")
            }
        }
    }

    buildTypes {
        release {
            isMinifyEnabled = false
            proguardFiles(
                getDefaultProguardFile("proguard-android-optimize.txt"),
                "proguard-rules.pro"
            )
        }
    }
    externalNativeBuild {
        cmake {
            path("src/main/cpp/CMakeLists.txt")
            version = "3.22.1"
        }
    }
    compileOptions {
        sourceCompatibility = JavaVersion.VERSION_1_8
        targetCompatibility = JavaVersion.VERSION_1_8
    }
    kotlinOptions {
        jvmTarget = "1.8"
    }

    packaging {
        resources {
            excludes += "/META-INF/{AL2.0,LGPL2.1}"
        }
    }
}

dependencies {

    implementation("androidx.core:core-ktx:1.12.0")
    implementation("androidx.appcompat:appcompat:1.6.1")
    implementation("com.google.android.material:material:1.11.0")
    testImplementation("junit:junit:4.13.2")
    androidTestImplementation("androidx.test.ext:junit:1.1.5")
    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
}
0 examples/llama.android/llama/consumer-rules.pro Normal file
21 examples/llama.android/llama/proguard-rules.pro vendored Normal file
@@ -0,0 +1,21 @@
# Add project specific ProGuard rules here.
# You can control the set of applied configuration files using the
# proguardFiles setting in build.gradle.
#
# For more details, see
#   http://developer.android.com/guide/developing/tools/proguard.html

# If your project uses WebView with JS, uncomment the following
# and specify the fully qualified class name to the JavaScript interface
# class:
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
#   public *;
#}

# Uncomment this to preserve the line number information for
# debugging stack traces.
#-keepattributes SourceFile,LineNumberTable

# If you keep the line number information, uncomment this to
# hide the original source file name.
#-renamesourcefileattribute SourceFile
@@ -0,0 +1,24 @@
package android.llama.cpp

import androidx.test.platform.app.InstrumentationRegistry
import androidx.test.ext.junit.runners.AndroidJUnit4

import org.junit.Test
import org.junit.runner.RunWith

import org.junit.Assert.*

/**
 * Instrumented test, which will execute on an Android device.
 *
 * See [testing documentation](http://d.android.com/tools/testing).
 */
@RunWith(AndroidJUnit4::class)
class ExampleInstrumentedTest {
    @Test
    fun useAppContext() {
        // Context of the app under test.
        val appContext = InstrumentationRegistry.getInstrumentation().targetContext
        assertEquals("android.llama.cpp.test", appContext.packageName)
    }
}
@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android">

</manifest>
@@ -1,4 +1,3 @@
 # For more information about using CMake with Android Studio, read the
 # documentation: https://d.android.com/studio/projects/add-native-code.html.
 # For more examples on how to use CMake, see https://github.com/android/ndk-samples.
@@ -36,15 +35,15 @@ FetchContent_MakeAvailable(llama)
 # for GameActivity/NativeActivity derived applications, the same library name must be
 # used in the AndroidManifest.xml file.
 add_library(${CMAKE_PROJECT_NAME} SHARED
         # List C/C++ source files with relative paths to this CMakeLists.txt.
         llama-android.cpp)

 # Specifies libraries CMake should link to your target library. You
 # can link libraries from various origins, such as libraries defined in this
 # build script, prebuilt third-party libraries, or Android system libraries.
 target_link_libraries(${CMAKE_PROJECT_NAME}
         # List libraries link to the target library
         llama
         common
         android
         log)
@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
|
|||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT jlong JNICALL
|
JNIEXPORT jlong JNICALL
|
||||||
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
|
Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||||
llama_model_params model_params = llama_model_default_params();
|
llama_model_params model_params = llama_model_default_params();
|
||||||
|
|
||||||
auto path_to_model = env->GetStringUTFChars(filename, 0);
|
auto path_to_model = env->GetStringUTFChars(filename, 0);
|
||||||
@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
|
|||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
|
Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
|
||||||
llama_free_model(reinterpret_cast<llama_model *>(model));
|
llama_free_model(reinterpret_cast<llama_model *>(model));
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT jlong JNICALL
|
JNIEXPORT jlong JNICALL
|
||||||
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||||
auto model = reinterpret_cast<llama_model *>(jmodel);
|
auto model = reinterpret_cast<llama_model *>(jmodel);
|
||||||
|
|
||||||
if (!model) {
|
if (!model) {
|
||||||
@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
|||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
|
Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
|
||||||
llama_free(reinterpret_cast<llama_context *>(context));
|
llama_free(reinterpret_cast<llama_context *>(context));
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
|
Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
|
Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
|
||||||
llama_log_set(log_callback, NULL);
|
llama_log_set(log_callback, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT jstring JNICALL
|
JNIEXPORT jstring JNICALL
|
||||||
Java_com_example_llama_Llm_bench_1model(
|
Java_android_llama_cpp_LLamaAndroid_bench_1model(
|
||||||
JNIEnv *env,
|
JNIEnv *env,
|
||||||
jobject,
|
jobject,
|
||||||
jlong context_pointer,
|
jlong context_pointer,
|
||||||
@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model(
|
|||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
||||||
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
|
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT jlong JNICALL
|
JNIEXPORT jlong JNICALL
|
||||||
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
||||||
|
|
||||||
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
|
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
|
||||||
|
|
||||||
@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
|
|||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
|
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT jstring JNICALL
|
JNIEXPORT jstring JNICALL
|
||||||
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
|
Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
|
||||||
return env->NewStringUTF(llama_print_system_info());
|
return env->NewStringUTF(llama_print_system_info());
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT jint JNICALL
|
JNIEXPORT jint JNICALL
|
||||||
Java_com_example_llama_Llm_completion_1init(
|
Java_android_llama_cpp_LLamaAndroid_completion_1init(
|
||||||
JNIEnv *env,
|
JNIEnv *env,
|
||||||
jobject,
|
jobject,
|
||||||
jlong context_pointer,
|
jlong context_pointer,
|
||||||
@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init(
|
|||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT jstring JNICALL
|
JNIEXPORT jstring JNICALL
|
||||||
Java_com_example_llama_Llm_completion_1loop(
|
Java_android_llama_cpp_LLamaAndroid_completion_1loop(
|
||||||
JNIEnv * env,
|
JNIEnv * env,
|
||||||
jobject,
|
jobject,
|
||||||
jlong context_pointer,
|
jlong context_pointer,
|
||||||
@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop(
|
|||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
||||||
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
|
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
|
||||||
}
|
}
|
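The renames in the hunks above follow the standard JNI symbol-naming rule: `Java_` + package (dots replaced by underscores) + class + method, with literal underscores in Kotlin method names escaped as `_1` (so `load_model` becomes `load_1model`). A minimal illustration of the declaration that the new Kotlin package `android.llama.cpp` and class `LLamaAndroid` resolve to (declaration only, taken from the rename above):

```cpp
// How the JNI export name maps to the Kotlin side:
//     package android.llama.cpp
//     class LLamaAndroid { private external fun load_model(filename: String): Long }
// resolves to the exported symbol below.
#include <jni.h>

extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv * env, jobject /*thiz*/, jstring filename);
```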
@@ -1,4 +1,4 @@
-package com.example.llama
+package android.llama.cpp

 import android.util.Log
 import kotlinx.coroutines.CoroutineDispatcher
@@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
 import java.util.concurrent.Executors
 import kotlin.concurrent.thread

-class Llm {
+class LLamaAndroid {
     private val tag: String? = this::class.simpleName

     private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
@@ -165,8 +165,8 @@ class Llm {
         }

         // Enforce only one instance of Llm.
-        private val _instance: Llm = Llm()
+        private val _instance: LLamaAndroid = LLamaAndroid()

-        fun instance(): Llm = _instance
+        fun instance(): LLamaAndroid = _instance
    }
}
@@ -0,0 +1,17 @@
package android.llama.cpp

import org.junit.Test

import org.junit.Assert.*

/**
 * Example local unit test, which will execute on the development machine (host).
 *
 * See [testing documentation](http://d.android.com/tools/testing).
 */
class ExampleUnitTest {
    @Test
    fun addition_isCorrect() {
        assertEquals(4, 2 + 2)
    }
}
@@ -15,3 +15,4 @@ dependencyResolutionManagement {

 rootProject.name = "LlamaAndroid"
 include(":app")
+include(":llama")
@ -290,7 +290,7 @@ int main(int argc, char ** argv) {
|
|||||||
#endif // LOG_DISABLE_LOGS
|
#endif // LOG_DISABLE_LOGS
|
||||||
|
|
||||||
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||||
gpt_print_usage(argc, argv, params);
|
gpt_params_print_usage(argc, argv, params);
|
||||||
show_additional_info(argc, argv);
|
show_additional_info(argc, argv);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -300,14 +300,10 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto & image : params.image) {
|
if (prompt_contains_image(params.prompt)) {
|
||||||
auto ctx_llava = llava_init_context(¶ms, model);
|
auto ctx_llava = llava_init_context(¶ms, model);
|
||||||
|
|
||||||
auto image_embed = load_image(ctx_llava, ¶ms, image);
|
auto image_embed = load_image(ctx_llava, ¶ms, "");
|
||||||
if (!image_embed) {
|
|
||||||
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// process the prompt
|
// process the prompt
|
||||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||||
@ -316,7 +312,26 @@ int main(int argc, char ** argv) {
|
|||||||
llava_image_embed_free(image_embed);
|
llava_image_embed_free(image_embed);
|
||||||
ctx_llava->model = NULL;
|
ctx_llava->model = NULL;
|
||||||
llava_free(ctx_llava);
|
llava_free(ctx_llava);
|
||||||
|
} else {
|
||||||
|
for (auto & image : params.image) {
|
||||||
|
auto ctx_llava = llava_init_context(¶ms, model);
|
||||||
|
|
||||||
|
auto image_embed = load_image(ctx_llava, ¶ms, image);
|
||||||
|
if (!image_embed) {
|
||||||
|
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// process the prompt
|
||||||
|
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||||
|
|
||||||
|
llama_print_timings(ctx_llava->ctx_llama);
|
||||||
|
llava_image_embed_free(image_embed);
|
||||||
|
ctx_llava->model = NULL;
|
||||||
|
llava_free(ctx_llava);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
|
|||||||
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
|
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
|
||||||
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
|
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
|
||||||
struct {
|
struct {
|
||||||
struct ggml_tensor * newline;
|
|
||||||
struct ggml_context * ctx;
|
struct ggml_context * ctx;
|
||||||
} model;
|
} model;
|
||||||
|
|
||||||
@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
|||||||
|
|
||||||
model.ctx = ggml_init(params);
|
model.ctx = ggml_init(params);
|
||||||
|
|
||||||
ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
|
|
||||||
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
|
|
||||||
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
|
|
||||||
if (newline_tmp->buffer == NULL) {
|
|
||||||
LOG_TEE("newline_tmp tensor buffer is NULL\n");
|
|
||||||
}
|
|
||||||
ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
|
|
||||||
} else {
|
|
||||||
model.newline->data = newline_tmp->data;
|
|
||||||
if (model.newline->data == NULL) {
|
|
||||||
LOG_TEE("newline_tmp tensor data is NULL\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
|
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
|
||||||
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
|
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
|
||||||
// fill it with the image embeddings, ignoring the base
|
// fill it with the image embeddings, ignoring the base
|
||||||
|
@ -174,7 +174,7 @@ int main(int argc, char ** argv) {
|
|||||||
// debug
|
// debug
|
||||||
if (dump_kv_cache) {
|
if (dump_kv_cache) {
|
||||||
llama_kv_cache_view_update(ctx, &kvc_view);
|
llama_kv_cache_view_update(ctx, &kvc_view);
|
||||||
dump_kv_cache_view_seqs(kvc_view, 40);
|
llama_kv_cache_dump_view_seqs(kvc_view, 40);
|
||||||
}
|
}
|
||||||
|
|
||||||
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
||||||
|
@ -121,7 +121,7 @@ int main(int argc, char ** argv){
|
|||||||
// debug
|
// debug
|
||||||
if (dump_kv_cache) {
|
if (dump_kv_cache) {
|
||||||
llama_kv_cache_view_update(ctx, &kvc_view);
|
llama_kv_cache_view_update(ctx, &kvc_view);
|
||||||
dump_kv_cache_view_seqs(kvc_view, 40);
|
llama_kv_cache_dump_view_seqs(kvc_view, 40);
|
||||||
}
|
}
|
||||||
|
|
||||||
// print current draft sequence
|
// print current draft sequence
|
||||||
|
@@ -325,3 +325,5 @@ These options provide extra functionality and customization when running the LLa
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
+
+- `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.
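As a hedged illustration of the `--tensor-split` semantics described above (not the actual llama.cpp parser), a comma-separated value such as "3,2" normalizes to per-GPU fractions like this:

```cpp
// Illustrative only: normalize "--tensor-split 3,2" into per-GPU fractions.
#include <sstream>
#include <string>
#include <vector>
#include <cstdio>

static std::vector<float> parse_tensor_split(const std::string & arg) {
    std::vector<float> parts;
    std::stringstream ss(arg);
    std::string item;
    float sum = 0.0f;
    while (std::getline(ss, item, ',')) {
        parts.push_back(std::stof(item));
        sum += parts.back();
    }
    if (sum > 0.0f) {
        for (float & p : parts) {
            p /= sum; // "3,2" -> {0.6f, 0.4f}
        }
    }
    return parts;
}

int main() {
    for (float f : parse_tensor_split("3,2")) {
        printf("%.2f\n", f); // prints 0.60 then 0.40
    }
}
```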
@ -60,9 +60,9 @@ static void write_logfile(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::string timestamp = get_sortable_timestamp();
|
const std::string timestamp = string_get_sortable_timestamp();
|
||||||
|
|
||||||
const bool success = create_directory_with_parents(params.logdir);
|
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||||
if (!success) {
|
if (!success) {
|
||||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||||
__func__, params.logdir.c_str());
|
__func__, params.logdir.c_str());
|
||||||
@ -80,7 +80,7 @@ static void write_logfile(
|
|||||||
fprintf(logfile, "binary: main\n");
|
fprintf(logfile, "binary: main\n");
|
||||||
char model_desc[128];
|
char model_desc[128];
|
||||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||||
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
||||||
|
|
||||||
fprintf(logfile, "\n");
|
fprintf(logfile, "\n");
|
||||||
fprintf(logfile, "######################\n");
|
fprintf(logfile, "######################\n");
|
||||||
@ -88,8 +88,8 @@ static void write_logfile(
|
|||||||
fprintf(logfile, "######################\n");
|
fprintf(logfile, "######################\n");
|
||||||
fprintf(logfile, "\n");
|
fprintf(logfile, "\n");
|
||||||
|
|
||||||
dump_string_yaml_multiline(logfile, "output", output.c_str());
|
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
||||||
dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
|
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
||||||
|
|
||||||
llama_dump_timing_info_yaml(logfile, ctx);
|
llama_dump_timing_info_yaml(logfile, ctx);
|
||||||
fclose(logfile);
|
fclose(logfile);
|
||||||
@ -181,7 +181,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
std::mt19937 rng(params.seed);
|
||||||
if (params.random_prompt) {
|
if (params.random_prompt) {
|
||||||
params.prompt = gpt_random_prompt(rng);
|
params.prompt = string_random_prompt(rng);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("%s: llama backend init\n", __func__);
|
LOG("%s: llama backend init\n", __func__);
|
||||||
@ -219,7 +219,7 @@ int main(int argc, char ** argv) {
|
|||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s\n", get_system_info(params).c_str());
|
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string path_session = params.path_prompt_cache;
|
std::string path_session = params.path_prompt_cache;
|
||||||
@ -474,12 +474,12 @@ int main(int argc, char ** argv) {
|
|||||||
LOG_TEE("\n\n");
|
LOG_TEE("\n\n");
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
const char *control_message;
|
const char * control_message;
|
||||||
if (params.multiline_input) {
|
if (params.multiline_input) {
|
||||||
control_message = " - To return control to LLaMa, end your input with '\\'.\n"
|
control_message = " - To return control to the AI, end your input with '\\'.\n"
|
||||||
" - To return control without starting a new line, end your input with '/'.\n";
|
" - To return control without starting a new line, end your input with '/'.\n";
|
||||||
} else {
|
} else {
|
||||||
control_message = " - Press Return to return control to LLaMa.\n"
|
control_message = " - Press Return to return control to the AI.\n"
|
||||||
" - To return control without starting a new line, end your input with '/'.\n"
|
" - To return control without starting a new line, end your input with '/'.\n"
|
||||||
" - If you want to submit another line, end your input with '\\'.\n";
|
" - If you want to submit another line, end your input with '\\'.\n";
|
||||||
}
|
}
|
||||||
@ -707,7 +707,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
||||||
|
|
||||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
|
||||||
|
|
||||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
||||||
|
|
||||||
@ -728,7 +728,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||||
// for the prompt, we don't apply grammar rules
|
// for the prompt, we don't apply grammar rules
|
||||||
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
|
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
|
||||||
|
|
||||||
++n_consumed;
|
++n_consumed;
|
||||||
if ((int) embd.size() >= params.n_batch) {
|
if ((int) embd.size() >= params.n_batch) {
|
||||||
@ -740,18 +740,32 @@ int main(int argc, char ** argv) {
|
|||||||
// display text
|
// display text
|
||||||
if (input_echo && display) {
|
if (input_echo && display) {
|
||||||
for (auto id : embd) {
|
for (auto id : embd) {
|
||||||
const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
|
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||||
printf("%s", token_str.c_str());
|
|
||||||
|
|
||||||
|
// Console/Stream Output
|
||||||
|
if (!llama_token_is_control(llama_get_model(ctx), id)) {
|
||||||
|
// Stream Output Token To Standard Output
|
||||||
|
fprintf(stdout, "%s", token_str.c_str());
|
||||||
|
} else if (!params.no_special && !params.conversation) {
|
||||||
|
// Stream Control Token To Standard Output Stream
|
||||||
|
fprintf(stdout, "%s", token_str.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Record Displayed Tokens To Log
|
||||||
|
// Note: Generated tokens are created one by one hence this check
|
||||||
if (embd.size() > 1) {
|
if (embd.size() > 1) {
|
||||||
|
// Incoming Requested Tokens
|
||||||
input_tokens.push_back(id);
|
input_tokens.push_back(id);
|
||||||
} else {
|
} else {
|
||||||
|
// Outgoing Generated Tokens
|
||||||
output_tokens.push_back(id);
|
output_tokens.push_back(id);
|
||||||
output_ss << token_str;
|
output_ss << token_str;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// reset color to default if there is no pending user input
|
// reset color to default if there is no pending user input
|
||||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||||
console::set_display(console::reset);
|
console::set_display(console::reset);
|
||||||
@ -879,7 +893,7 @@ int main(int argc, char ** argv) {
|
|||||||
embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
|
embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
|
||||||
}
|
}
|
||||||
if (params.escape) {
|
if (params.escape) {
|
||||||
process_escapes(buffer);
|
string_process_escapes(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
|
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
|
||||||
|
@ -210,7 +210,7 @@ int main(int argc, char ** argv) {
|
|||||||
while (true) {
|
while (true) {
|
||||||
if (dump_kv_cache) {
|
if (dump_kv_cache) {
|
||||||
llama_kv_cache_view_update(ctx, &kvc_view);
|
llama_kv_cache_view_update(ctx, &kvc_view);
|
||||||
dump_kv_cache_view_seqs(kvc_view, 40);
|
llama_kv_cache_dump_view_seqs(kvc_view, 40);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_batch_clear(batch);
|
llama_batch_clear(batch);
|
||||||
|
@@ -7,6 +7,8 @@ Also note that finetunes typically result in a higher perplexity value even thou

 Within llama.cpp the perplexity of base models is used primarily to judge the quality loss from e.g. quantized models vs. FP16.
 The convention among contributors is to use the Wikitext-2 test set for testing unless noted otherwise (can be obtained with `scripts/get-wikitext-2.sh`).
+When numbers are listed all command line arguments and compilation options are left at their defaults unless noted otherwise.
+llama.cpp numbers are **not** directly comparable to those of other projects because the exact values depend strongly on the implementation details.

 By default only the mean perplexity value and the corresponding uncertainty is calculated.
 The uncertainty is determined empirically by assuming a Gaussian distribution of the "correct" logits per token and then applying error propagation.
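The quoted uncertainty can be reproduced with standard error propagation: perplexity is the exponential of the mean per-token negative log-likelihood, so its standard deviation is approximately the perplexity times the standard error of that mean. A small self-contained sketch (illustrative, not the perplexity.cpp implementation):

```cpp
// Hedged sketch: mean perplexity and its uncertainty from per-token
// negative log-likelihoods, via Gaussian error propagation
// (sigma_PPL ~= PPL * sigma_mean_NLL).
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    std::vector<double> nll = {2.1, 1.8, 2.4, 2.0, 1.9}; // hypothetical per-token -log p
    const double n = (double) nll.size();

    double mean = 0.0;
    for (double x : nll) mean += x;
    mean /= n;

    double var = 0.0;
    for (double x : nll) var += (x - mean) * (x - mean);
    var /= (n - 1.0);

    const double sem     = std::sqrt(var / n); // standard error of the mean NLL
    const double ppl     = std::exp(mean);     // perplexity
    const double ppl_unc = ppl * sem;          // first-order error propagation

    printf("PPL = %.4f +/- %.4f\n", ppl, ppl_unc);
}
```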
@@ -32,12 +34,21 @@ In addition to the KL divergence the following statistics are calculated with `-

 ## LLaMA 3 8b Scoreboard

-Results are sorted by Kullback-Leibler divergence relative to FP16.
+| Revision | f364eb6f           |
+|:---------|:-------------------|
+| Backend  | CUDA               |
+| CPU      | AMD Epyc 7742      |
+| GPU      | 1x NVIDIA RTX 4090 |
+
+Results were generated using the CUDA backend and are sorted by Kullback-Leibler divergence relative to FP16.
 The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat).
+Note: the FP16 logits used for the calculation of all metrics other than perplexity are stored in a binary file between runs.
+In order to save space this file does **not** contain the exact same FP32 logits but instead casts them to 16 bit unsigned integers (with some scaling).
+So the "f16" results are to be understood as the difference resulting only from this downcast.

 | Quantization | imatrix | Model size [GiB] | PPL                  | ΔPPL                 | KLD                 | Mean Δp          | RMS Δp          |
 |--------------|---------|------------------|----------------------|----------------------|---------------------|------------------|-----------------|
-| f16          | None    | 14.97            | 6.233160 ± 0.037828  | -                    | -                   | -                | -               |
+| f16          | None    | 14.97            | 6.233160 ± 0.037828  | 0.001524 ± 0.000755  | 0.000551 ± 0.000002 | 0.001 ± 0.002 %  | 0.787 ± 0.004 % |
 | q8_0         | None    | 7.96             | 6.234284 ± 0.037878  | 0.002650 ± 0.001006  | 0.001355 ± 0.000006 | -0.019 ± 0.003 % | 1.198 ± 0.007 % |
 | q6_K         | None    | 6.14             | 6.253382 ± 0.038078  | 0.021748 ± 0.001852  | 0.005452 ± 0.000035 | -0.007 ± 0.006 % | 2.295 ± 0.019 % |
 | q5_K_M       | None    | 5.33             | 6.288607 ± 0.038338  | 0.056974 ± 0.002598  | 0.010762 ± 0.000079 | -0.114 ± 0.008 % | 3.160 ± 0.031 % |
@ -89,6 +100,12 @@ K-quants score better on mean Δp than the legacy quants than e.g. KL divergence
|
|||||||
|
|
||||||
## LLaMA 2 vs. LLaMA 3 Quantization comparison
|
## LLaMA 2 vs. LLaMA 3 Quantization comparison
|
||||||
|
|
||||||
|
| Revision | f364eb6f |
|
||||||
|
|:---------|:-------------------|
|
||||||
|
| Backend | CUDA |
|
||||||
|
| CPU | AMD Epyc 7742 |
|
||||||
|
| GPU | 1x NVIDIA RTX 4090 |
|
||||||
|
|
||||||
| Metric | L2 7b q2_K | L3 8b q2_K | L2 7b q4_K_M | L3 8b q4_K_M | L2 7b q6_K | L3 8b q6_K | L2 7b q8_0 | L3 8b q8_0 |
|
| Metric | L2 7b q2_K | L3 8b q2_K | L2 7b q4_K_M | L3 8b q4_K_M | L2 7b q6_K | L3 8b q6_K | L2 7b q8_0 | L3 8b q8_0 |
|
||||||
|-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|
|
|-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|
|
||||||
| Mean PPL | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 |
|
| Mean PPL | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 |
|
||||||
@ -107,6 +124,50 @@ K-quants score better on mean Δp than the legacy quants than e.g. KL divergence
|
|||||||
| RMS Δp | 9.762 ± 0.053 % | 21.421 ± 0.079 % | 3.252 ± 0.024 % | 5.519 ± 0.050 % | 1.339 ± 0.010 % | 2.295 ± 0.019 % | 0.618 ± 0.011 % | 1.198 ± 0.007 % |
|
| RMS Δp | 9.762 ± 0.053 % | 21.421 ± 0.079 % | 3.252 ± 0.024 % | 5.519 ± 0.050 % | 1.339 ± 0.010 % | 2.295 ± 0.019 % | 0.618 ± 0.011 % | 1.198 ± 0.007 % |
|
||||||
| Same top p | 85.584 ± 0.086 % | 71.138 ± 0.119 % | 94.665 ± 0.055 % | 91.901 ± 0.072 % | 97.520 ± 0.038 % | 96.031 ± 0.051 % | 98.846 ± 0.026 % | 97.674 ± 0.040 % |
|
| Same top p | 85.584 ± 0.086 % | 71.138 ± 0.119 % | 94.665 ± 0.055 % | 91.901 ± 0.072 % | 97.520 ± 0.038 % | 96.031 ± 0.051 % | 98.846 ± 0.026 % | 97.674 ± 0.040 % |
|
||||||
|
|
||||||
|
## LLaMA 3 BF16 vs. FP16 comparison
|
||||||
|
|
||||||
|
| Revision | 83330d8c |
|
||||||
|
|:---------|:--------------|
|
||||||
|
| Backend | CPU |
|
||||||
|
| CPU | AMD Epyc 7742 |
|
||||||
|
| GPU | N/A |
|
||||||
|
|
||||||
|
Results were calculated with LLaMA 3 8b BF16 as `--kl-divergence-base` and LLaMA 3 8b FP16 as the `--model` for comparison.
|
||||||
|
|
||||||
|
| Metric | Value |
|
||||||
|
|--------------------------------|--------------------------|
|
||||||
|
| Mean PPL(Q) | 6.227711 ± 0.037833 |
|
||||||
|
| Mean PPL(base) | 6.225194 ± 0.037771 |
|
||||||
|
| Cor(ln(PPL(Q)), ln(PPL(base))) | 99.990% |
|
||||||
|
| Mean ln(PPL(Q)/PPL(base)) | 0.000404 ± 0.000086 |
|
||||||
|
| Mean PPL(Q)/PPL(base) | 1.000404 ± 0.000086 |
|
||||||
|
| Mean PPL(Q)-PPL(base) | 0.002517 ± 0.000536 |
|
||||||
|
| Mean KLD | 0.00002515 ± 0.00000020 |
|
||||||
|
| Maximum KLD | 0.012206 |
|
||||||
|
| 99.9% KLD | 0.000799 |
|
||||||
|
| 99.0% KLD | 0.000222 |
|
||||||
|
| 99.0% KLD | 0.000222 |
|
||||||
|
| Median KLD | 0.000013 |
|
||||||
|
| 10.0% KLD | -0.000002 |
|
||||||
|
| 5.0% KLD | -0.000008 |
|
||||||
|
| 1.0% KLD | -0.000023 |
|
||||||
|
| Minimum KLD | -0.000059 |
|
||||||
|
| Mean Δp | -0.0000745 ± 0.0003952 % |
|
||||||
|
| Maximum Δp | 4.186% |
|
||||||
|
| 99.9% Δp | 1.049% |
|
||||||
|
| 99.0% Δp | 0.439% |
|
||||||
|
| 95.0% Δp | 0.207% |
|
||||||
|
| 90.0% Δp | 0.125% |
|
||||||
|
| 75.0% Δp | 0.029% |
|
||||||
|
| Median Δp | 0.000% |
|
||||||
|
| 25.0% Δp | -0.030% |
|
||||||
|
| 10.0% Δp | -0.126% |
|
||||||
|
| 5.0% Δp | -0.207% |
|
||||||
|
| 1.0% Δp | -0.434% |
|
||||||
|
| 0.1% Δp | -1.016% |
|
||||||
|
| Minimum Δp | -4.672% |
|
||||||
|
| RMS Δp | 0.150 ± 0.001 % |
|
||||||
|
| Same top p | 99.739 ± 0.013 % |
|
||||||
|
|
||||||
## Old Numbers
|
## Old Numbers
|
||||||
|
|
||||||
|
@ -44,9 +44,9 @@ static void write_logfile(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::string timestamp = get_sortable_timestamp();
|
const std::string timestamp = string_get_sortable_timestamp();
|
||||||
|
|
||||||
const bool success = create_directory_with_parents(params.logdir);
|
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||||
if (!success) {
|
if (!success) {
|
||||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||||
__func__, params.logdir.c_str());
|
__func__, params.logdir.c_str());
|
||||||
@ -64,7 +64,7 @@ static void write_logfile(
|
|||||||
fprintf(logfile, "binary: main\n");
|
fprintf(logfile, "binary: main\n");
|
||||||
char model_desc[128];
|
char model_desc[128];
|
||||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||||
dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc);
|
yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
|
||||||
|
|
||||||
fprintf(logfile, "\n");
|
fprintf(logfile, "\n");
|
||||||
fprintf(logfile, "######################\n");
|
fprintf(logfile, "######################\n");
|
||||||
@ -72,9 +72,9 @@ static void write_logfile(
|
|||||||
fprintf(logfile, "######################\n");
|
fprintf(logfile, "######################\n");
|
||||||
fprintf(logfile, "\n");
|
fprintf(logfile, "\n");
|
||||||
|
|
||||||
dump_vector_float_yaml(logfile, "logits", results.logits);
|
yaml_dump_vector_float(logfile, "logits", results.logits);
|
||||||
fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
|
fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
|
||||||
dump_vector_float_yaml(logfile, "probs", results.probs);
|
yaml_dump_vector_float(logfile, "probs", results.probs);
|
||||||
|
|
||||||
llama_dump_timing_info_yaml(logfile, ctx);
|
llama_dump_timing_info_yaml(logfile, ctx);
|
||||||
fclose(logfile);
|
fclose(logfile);
|
||||||
@ -1425,7 +1425,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
// Use all tasks
|
// Use all tasks
|
||||||
tasks.resize(n_task);
|
tasks.resize(n_task);
|
||||||
printf("%s: reading tasks", __func__);
|
printf("%s: reading tasks", __func__);
|
||||||
int n_dot = n_task/100;
|
int n_dot = std::max((int) n_task/100, 1);
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (auto& task : tasks) {
|
for (auto& task : tasks) {
|
||||||
++i;
|
++i;
|
||||||
@ -1675,7 +1675,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
|
|
||||||
if (n_done < 100) return;
|
if (n_done < 100 && (params.multiple_choice_tasks != 0 && params.multiple_choice_tasks < (size_t)n_task)) return;
|
||||||
|
|
||||||
float p = 1.f*n_correct/n_done;
|
float p = 1.f*n_correct/n_done;
|
||||||
float sigma = sqrt(p*(1-p)/(n_done-1));
|
float sigma = sqrt(p*(1-p)/(n_done-1));
|
||||||
@ -2007,7 +2007,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
std::mt19937 rng(params.seed);
|
||||||
if (params.random_prompt) {
|
if (params.random_prompt) {
|
||||||
params.prompt = gpt_random_prompt(rng);
|
params.prompt = string_random_prompt(rng);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
@ -2035,7 +2035,7 @@ int main(int argc, char ** argv) {
|
|||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
struct results_perplexity results;
|
struct results_perplexity results;
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
# quantize
|
# quantize
|
||||||
|
|
||||||
TODO
|
You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup.
|
||||||
|
|
||||||
|
Note: It is synced from llama.cpp `main` every 6 hours.
|
||||||
|
|
||||||
## Llama 2 7B
|
## Llama 2 7B
|
||||||
|
|
||||||
|
@ -259,7 +259,7 @@ int main(int argc, char ** argv) {
|
|||||||
usage(argv[0]);
|
usage(argv[0]);
|
||||||
}
|
}
|
||||||
} else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
|
} else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
|
||||||
if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) {
|
if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
|
||||||
usage(argv[0]);
|
usage(argv[0]);
|
||||||
}
|
}
|
||||||
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
|
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
|
||||||
@ -284,7 +284,7 @@ int main(int argc, char ** argv) {
|
|||||||
} else {
|
} else {
|
||||||
usage(argv[0]);
|
usage(argv[0]);
|
||||||
}
|
}
|
||||||
} else if (strcmp(argv[arg_idx], "--keep-split")) {
|
} else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
|
||||||
params.keep_split = true;
|
params.keep_split = true;
|
||||||
} else {
|
} else {
|
||||||
usage(argv[0]);
|
usage(argv[0]);
|
||||||
|
@ -41,8 +41,8 @@ $SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/g
|
|||||||
echo PASS
|
echo PASS
|
||||||
echo
|
echo
|
||||||
|
|
||||||
# 3. Requant model with '--keep_split'
|
# 3. Requant model with '--keep-split'
|
||||||
$QUANTIZE --allow-requantize --keep_split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
|
$QUANTIZE --allow-requantize --keep-split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
|
||||||
echo PASS
|
echo PASS
|
||||||
echo
|
echo
|
||||||
|
|
||||||
@ -51,7 +51,7 @@ $MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt
|
|||||||
echo PASS
|
echo PASS
|
||||||
echo
|
echo
|
||||||
|
|
||||||
# 4. Requant mode without '--keep_split'
|
# 4. Requant mode without '--keep-split'
|
||||||
$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
|
$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
|
||||||
echo PASS
|
echo PASS
|
||||||
echo
|
echo
|
||||||
|
@ -11,7 +11,7 @@ struct retrieval_params {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
|
static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
|
||||||
gpt_print_usage(argc, argv, gpt_params);
|
gpt_params_print_usage(argc, argv, gpt_params);
|
||||||
printf("retrieval options:\n");
|
printf("retrieval options:\n");
|
||||||
printf(" --context-file FNAME file containing context to embed.\n");
|
printf(" --context-file FNAME file containing context to embed.\n");
|
||||||
printf(" specify multiple files by providing --context-file option multiple times.\n");
|
printf(" specify multiple files by providing --context-file option multiple times.\n");
|
||||||
@ -226,7 +226,7 @@ int main(int argc, char ** argv) {
|
|||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// max batch size
|
// max batch size
|
||||||
|
2 examples/rpc/CMakeLists.txt Normal file
@@ -0,0 +1,2 @@
add_executable(rpc-server rpc-server.cpp)
target_link_libraries(rpc-server PRIVATE ggml llama)

74 examples/rpc/README.md Normal file
@@ -0,0 +1,74 @@
## Overview

The `rpc-server` allows running `ggml` backend on a remote host.
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
This can be used for distributed LLM inference with `llama.cpp` in the following way:

```mermaid
flowchart TD
    rpcb---|TCP|srva
    rpcb---|TCP|srvb
    rpcb-.-|TCP|srvn
    subgraph hostn[Host N]
    srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
    end
    subgraph hostb[Host B]
    srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
    end
    subgraph hosta[Host A]
    srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
    end
    subgraph host[Main Host]
    ggml[llama.cpp]---rpcb[RPC backend]
    end
    style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
```

Each host can run a different backend, e.g. one with CUDA and another with Metal.
You can also run multiple `rpc-server` instances on the same host, each with a different backend.

## Usage

On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
For example, to build the CUDA backend with RPC support:

```bash
mkdir build-rpc-cuda
cd build-rpc-cuda
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
cmake --build . --config Release
```

Then, start the `rpc-server` with the backend:

```bash
$ bin/rpc-server -p 50052
create_backend: using CUDA backend
ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
Starting RPC server on 0.0.0.0:50052
```

When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
```bash
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
```
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.


On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:

```bash
mkdir build-rpc
cd build-rpc
cmake .. -DLLAMA_RPC=ON
cmake --build . --config Release
```

Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:

```bash
$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
```
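The `--rpc` flag takes a comma-separated list of `host:port` endpoints, as in the command above. As a hedged sketch of how such a list can be split into individual endpoints (illustrative only, not the actual llama.cpp option parser):

```cpp
// Illustrative only: split "--rpc host:port,host:port" into endpoints.
#include <sstream>
#include <string>
#include <vector>
#include <cstdio>

int main() {
    const std::string rpc = "192.168.88.10:50052,192.168.88.11:50052";

    std::vector<std::string> endpoints;
    std::stringstream ss(rpc);
    std::string item;
    while (std::getline(ss, item, ',')) {
        endpoints.push_back(item);
    }

    for (const auto & e : endpoints) {
        printf("rpc-server endpoint: %s\n", e.c_str());
    }
}
```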
134
examples/rpc/rpc-server.cpp
Normal file
134
examples/rpc/rpc-server.cpp
Normal file
@ -0,0 +1,134 @@
|
|||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
#include "ggml-cuda.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
#include "ggml-metal.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "ggml-rpc.h"
|
||||||
|
#ifdef _WIN32
|
||||||
|
# include <windows.h>
|
||||||
|
#else
|
||||||
|
# include <unistd.h>
|
||||||
|
#endif
|
||||||
|
#include <string>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
struct rpc_server_params {
|
||||||
|
std::string host = "0.0.0.0";
|
||||||
|
int port = 50052;
|
||||||
|
size_t backend_mem = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
|
||||||
|
fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
|
||||||
|
fprintf(stderr, "options:\n");
|
||||||
|
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||||
|
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
|
||||||
|
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
|
||||||
|
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
|
||||||
|
std::string arg;
|
||||||
|
for (int i = 1; i < argc; i++) {
|
||||||
|
arg = argv[i];
|
||||||
|
if (arg == "-H" || arg == "--host") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
params.host = argv[i];
|
||||||
|
} else if (arg == "-p" || arg == "--port") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
params.port = std::stoi(argv[i]);
|
||||||
|
if (params.port <= 0 || params.port > 65535) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} else if (arg == "-m" || arg == "--mem") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
|
||||||
|
} else if (arg == "-h" || arg == "--help") {
|
||||||
|
print_usage(argc, argv, params);
|
||||||
|
exit(0);
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||||
|
print_usage(argc, argv, params);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_backend_t create_backend() {
|
||||||
|
ggml_backend_t backend = NULL;
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
fprintf(stderr, "%s: using CUDA backend\n", __func__);
|
||||||
|
backend = ggml_backend_cuda_init(0); // init device 0
|
||||||
|
if (!backend) {
|
||||||
|
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
|
||||||
|
}
|
||||||
|
#elif GGML_USE_METAL
|
||||||
|
fprintf(stderr, "%s: using Metal backend\n", __func__);
|
||||||
|
backend = ggml_backend_metal_init();
|
||||||
|
if (!backend) {
|
||||||
|
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// if there aren't GPU Backends fallback to CPU backend
|
||||||
|
if (!backend) {
|
||||||
|
fprintf(stderr, "%s: using CPU backend\n", __func__);
|
||||||
|
backend = ggml_backend_cpu_init();
|
||||||
|
}
|
||||||
|
return backend;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
|
||||||
|
#else
|
||||||
|
#ifdef _WIN32
|
||||||
|
MEMORYSTATUSEX status;
|
||||||
|
status.dwLength = sizeof(status);
|
||||||
|
GlobalMemoryStatusEx(&status);
|
||||||
|
*total_mem = status.ullTotalPhys;
|
||||||
|
*free_mem = status.ullAvailPhys;
|
||||||
|
#else
|
||||||
|
long pages = sysconf(_SC_PHYS_PAGES);
|
||||||
|
long page_size = sysconf(_SC_PAGE_SIZE);
|
||||||
|
*total_mem = pages * page_size;
|
||||||
|
*free_mem = *total_mem;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char * argv[]) {
|
||||||
|
rpc_server_params params;
|
||||||
|
if (!rpc_server_params_parse(argc, argv, params)) {
|
||||||
|
fprintf(stderr, "Invalid parameters\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
ggml_backend_t backend = create_backend();
|
||||||
|
if (!backend) {
|
||||||
|
fprintf(stderr, "Failed to create backend\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
std::string endpoint = params.host + ":" + std::to_string(params.port);
|
||||||
|
size_t free_mem, total_mem;
|
||||||
|
if (params.backend_mem > 0) {
|
||||||
|
free_mem = params.backend_mem;
|
||||||
|
total_mem = params.backend_mem;
|
||||||
|
} else {
|
||||||
|
get_backend_memory(&free_mem, &total_mem);
|
||||||
|
}
|
||||||
|
printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
|
||||||
|
start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
|
||||||
|
ggml_backend_free(backend);
|
||||||
|
return 0;
|
||||||
|
}
|
@ -17,8 +17,9 @@ The project is under active development, and we are [looking for feedback and co

**Command line options:**

- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
- `-t N`, `--threads N`: Set the number of threads to use by CPU layers during generation. Not used by model layers that are offloaded to GPU. This option has no effect when using the maximum number of GPU layers. Default: `std::thread::hardware_concurrency()` (number of CPU cores).
- `-tb N, --threads-batch N`: Set the number of threads to use by CPU layers during batch and prompt processing (>= 32 tokens). This option has no effect if a GPU is available. Default: `--threads`.
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused

@ -36,9 +37,7 @@ The project is under active development, and we are [looking for feedback and co

- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
  - `--numa distribute`: Spread execution evenly over all nodes
  - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
  - `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
- `--numa`: Attempt optimizations that may help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

@ -48,8 +47,8 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/

- `--path`: Path from which to serve static files. Default: disabled
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys (see the example request below).
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests but the results will **not** be deterministic due to differences in rounding error.
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
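For instance, a completion request authorized with one of the configured keys could look like the following minimal sketch. The key `secret-key` and the prompt are placeholder values, the server is assumed to run on the default `127.0.0.1:8080`, and the third-party Python `requests` package is used:

```python
# Sketch of an authorized request; assumes the server was started with
# `--api-key secret-key` (hypothetical value) on the default 127.0.0.1:8080.
import requests

resp = requests.post(
    "http://127.0.0.1:8080/completion",
    headers={"Authorization": "Bearer secret-key"},  # api key passed as Bearer token
    json={"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 32},
)
resp.raise_for_status()
print(resp.json()["content"])
```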
52
examples/server/public_simplechat/index.html
Normal file
52
examples/server/public_simplechat/index.html
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<title>SimpleChat (LlamaCPP, ...) </title>
|
||||||
|
<meta charset="UTF-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
|
<meta name="message" content="Save Nature Save Earth" />
|
||||||
|
<meta name="description" content="SimpleChat: trigger LLM web service endpoints /chat/completions and /completions, single/multi chat sessions" />
|
||||||
|
<meta name="author" content="by Humans for All" />
|
||||||
|
<meta http-equiv="Cache-Control" content="no-cache, no-store, must-revalidate" />
|
||||||
|
<script src="simplechat.js" defer></script>
|
||||||
|
<link rel="stylesheet" href="simplechat.css" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="samecolumn" id="fullbody">
|
||||||
|
|
||||||
|
<div class="sameline">
|
||||||
|
<p class="heading flex-grow" > <b> SimpleChat </b> </p>
|
||||||
|
<div class="sameline">
|
||||||
|
<label for="api-ep">Mode:</label>
|
||||||
|
<select name="api-ep" id="api-ep">
|
||||||
|
<option value="chat" selected>Chat</option>
|
||||||
|
<option value="completion">Completion</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="sessions-div" class="sameline"></div>
|
||||||
|
|
||||||
|
<hr>
|
||||||
|
<div class="sameline">
|
||||||
|
<label for="system-in">System</label>
|
||||||
|
<input type="text" name="system" id="system-in" class="flex-grow"/>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr>
|
||||||
|
<div id="chat-div">
|
||||||
|
<p> Enter the system prompt above, before entering/submitting any user query.</p>
|
||||||
|
<p> Enter your text to the ai assistant below.</p>
|
||||||
|
<p> Use shift+enter for inserting enter.</p>
|
||||||
|
<p> Refresh the page to start over fresh.</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr>
|
||||||
|
<div class="sameline">
|
||||||
|
<textarea id="user-in" class="flex-grow" rows="3"></textarea>
|
||||||
|
<button id="user-btn">submit</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
81
examples/server/public_simplechat/readme.md
Normal file
81
examples/server/public_simplechat/readme.md
Normal file
@ -0,0 +1,81 @@

# SimpleChat

by Humans for All.

## overview

This simple web frontend allows triggering/testing the server's /completions or /chat/completions endpoints
in a simple way, with minimal code from a common code base. In addition it tries to allow single or
multiple independent back-and-forth chat sessions with the ai llm model at a basic level, each with its
own system prompt. A request sketch for both endpoints is included at the end of this readme.

The UI follows a responsive web design so that the layout can adapt to available display space in a usable
enough manner, in general.

NOTE: Given that the idea is basic minimal testing, it doesn't bother with the model context length or
culling of old messages from the chat.

NOTE: It doesn't set any parameters other than temperature for now. However if someone wants they can update
the js file as needed.

## usage

One could run this web frontend directly using the server itself, or if anyone is thinking of adding a built-in web
frontend to configure the server over http(s), then run this web frontend using something like python's
http module.

### running using examples/server

bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT]

### running using python3's server module

first run examples/server
* bin/server -m path/model.gguf

next run this web front end in examples/server/public_simplechat
* cd ../examples/server/public_simplechat
* python3 -m http.server PORT

### using the front end

Open this simple web front end from your local browser
* http://127.0.0.1:PORT/index.html

Once inside
* Select between chat and completion mode. By default it is set to chat mode.
* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
  * if chat.add_system_begin is used
    * you can't change the system prompt after it has been submitted once along with a user query.
    * you can't set a system prompt after you have submitted any user query.
  * if chat.add_system_anytime is used
    * one can change the system prompt at any time during the chat, by changing the contents of the system prompt field.
    * in turn the updated/changed system prompt will be inserted into the chat session.
    * this allows the subsequent user chatting to be driven by the new system prompt set above.
* Enter your query and either press enter or click on the submit button.
  If you want to insert enter (\n) as part of your chat/query to the ai model, use shift+enter.
* Wait for the logic to communicate with the server and get the response.
  * the user is not allowed to enter any fresh query during this time.
  * the user input box will be disabled and a working message will be shown in it.
* Just refresh the page to reset the chat history and/or system prompt and start afresh.
* Using NewChat one can start independent chat sessions.
  * two independent chat sessions are set up by default.

## Devel note

Sometimes the browser may be stubborn with caching of the files, so your updates to html/css/js
may not be visible. Also remember that just refreshing/reloading the page in the browser, or for that
matter clearing site data, doesn't directly override site caching in all cases. Worst case you may
have to change the port. Or in the dev tools of the browser, you may be able to disable caching fully.

The concept of multiple chat sessions with different servers, as well as saving and restoring of
those across browser usage sessions, can be woven around the SimpleChat/MultiChatUI classes and
their instances relatively easily, however given the current goal of keeping this simple, it has
not been added for now.

By switching between chat.add_system_begin/anytime, one can control whether the system prompt can be
changed at any time during the conversation or only at the beginning.
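As a rough illustration of the two request shapes this frontend builds, here is a minimal sketch (not part of the frontend itself). It assumes a llama.cpp server on the default http://127.0.0.1:8080 and the third-party Python `requests` package; the payloads mirror `request_messages_jsonstr()` and `request_prompt_jsonstr()` in simplechat.js:

```python
# Sketch of the two payloads SimpleChat builds; assumes a llama.cpp server
# listening on http://127.0.0.1:8080 and the third-party `requests` package.
import requests

base = "http://127.0.0.1:8080"

# /chat/completions: messages array, as produced by request_messages_jsonstr()
chat_payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello"},
    ],
    "temperature": 0.7,
}
r = requests.post(f"{base}/chat/completions", json=chat_payload)
print(r.json()["choices"][0]["message"]["content"])

# /completions: flattened "role: content" prompt, as produced by request_prompt_jsonstr()
completion_payload = {
    "prompt": "system: You are a helpful assistant.\nuser: Hello\n",
    "temperature": 0.7,
}
r = requests.post(f"{base}/completions", json=completion_payload)
data = r.json()
print(data["content"] if "content" in data else data["choices"][0]["text"])
```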
61
examples/server/public_simplechat/simplechat.css
Normal file
61
examples/server/public_simplechat/simplechat.css
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
/**
|
||||||
|
* the styling of the simplechat web frontend
|
||||||
|
* by Humans for All
|
||||||
|
*/
|
||||||
|
|
||||||
|
#fullbody {
|
||||||
|
height: 98vh;
|
||||||
|
}
|
||||||
|
|
||||||
|
.heading {
|
||||||
|
background-color: lightgray;
|
||||||
|
}
|
||||||
|
|
||||||
|
.session-selected {
|
||||||
|
background-color: lightblue;
|
||||||
|
}
|
||||||
|
|
||||||
|
.role-system {
|
||||||
|
background-color: lightblue;
|
||||||
|
}
|
||||||
|
.role-user {
|
||||||
|
background-color: lightgray;
|
||||||
|
}
|
||||||
|
|
||||||
|
.flex-grow {
|
||||||
|
flex-grow: 1;
|
||||||
|
}
|
||||||
|
.float-right {
|
||||||
|
float: right;
|
||||||
|
}
|
||||||
|
|
||||||
|
#chat-div {
|
||||||
|
overflow: scroll;
|
||||||
|
flex-grow: 1;
|
||||||
|
flex-shrink: 1;
|
||||||
|
min-height: 40vh;
|
||||||
|
}
|
||||||
|
button {
|
||||||
|
min-width: 8vw;
|
||||||
|
}
|
||||||
|
|
||||||
|
.sameline {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: row;
|
||||||
|
}
|
||||||
|
.samecolumn {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
}
|
||||||
|
|
||||||
|
* {
|
||||||
|
margin: 0.6vmin;
|
||||||
|
}
|
||||||
|
|
||||||
|
@media print {
|
||||||
|
|
||||||
|
#fullbody {
|
||||||
|
height: auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
478
examples/server/public_simplechat/simplechat.js
Normal file
478
examples/server/public_simplechat/simplechat.js
Normal file
@ -0,0 +1,478 @@
|
|||||||
|
// @ts-check
|
||||||
|
// A simple completions and chat/completions test related web front end logic
|
||||||
|
// by Humans for All
|
||||||
|
|
||||||
|
class Roles {
|
||||||
|
static System = "system";
|
||||||
|
static User = "user";
|
||||||
|
static Assistant = "assistant";
|
||||||
|
}
|
||||||
|
|
||||||
|
class ApiEP {
|
||||||
|
static Chat = "chat";
|
||||||
|
static Completion = "completion";
|
||||||
|
}
|
||||||
|
|
||||||
|
let gUsageMsg = `
|
||||||
|
<p> Enter the system prompt above, before entering/submitting any user query.</p>
|
||||||
|
<p> Enter your text to the ai assistant below.</p>
|
||||||
|
<p> Use shift+enter for inserting enter.</p>
|
||||||
|
<p> Refresh the page to start over fresh.</p>
|
||||||
|
`;
|
||||||
|
|
||||||
|
class SimpleChat {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
/**
|
||||||
|
* Maintain in a form suitable for common LLM web service chat/completions' messages entry
|
||||||
|
* @type {{role: string, content: string}[]}
|
||||||
|
*/
|
||||||
|
this.xchat = [];
|
||||||
|
this.iLastSys = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add an entry into xchat
|
||||||
|
* @param {string} role
|
||||||
|
* @param {string|undefined|null} content
|
||||||
|
*/
|
||||||
|
add(role, content) {
|
||||||
|
if ((content == undefined) || (content == null) || (content == "")) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
this.xchat.push( {role: role, content: content} );
|
||||||
|
if (role == Roles.System) {
|
||||||
|
this.iLastSys = this.xchat.length - 1;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Show the contents in the specified div
|
||||||
|
* @param {HTMLDivElement} div
|
||||||
|
* @param {boolean} bClear
|
||||||
|
*/
|
||||||
|
show(div, bClear=true) {
|
||||||
|
if (bClear) {
|
||||||
|
div.replaceChildren();
|
||||||
|
}
|
||||||
|
let last = undefined;
|
||||||
|
for(const x of this.xchat) {
|
||||||
|
let entry = document.createElement("p");
|
||||||
|
entry.className = `role-${x.role}`;
|
||||||
|
entry.innerText = `${x.role}: ${x.content}`;
|
||||||
|
div.appendChild(entry);
|
||||||
|
last = entry;
|
||||||
|
}
|
||||||
|
if (last !== undefined) {
|
||||||
|
last.scrollIntoView(false);
|
||||||
|
} else {
|
||||||
|
if (bClear) {
|
||||||
|
div.innerHTML = gUsageMsg;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add the needed fields to the json object that will be sent to the LLM web service's completions endpoint.
|
||||||
|
* Convert the json into string.
|
||||||
|
* @param {Object} obj
|
||||||
|
*/
|
||||||
|
request_jsonstr(obj) {
|
||||||
|
obj["temperature"] = 0.7;
|
||||||
|
return JSON.stringify(obj);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return a string form of json object suitable for chat/completions
|
||||||
|
*/
|
||||||
|
request_messages_jsonstr() {
|
||||||
|
let req = {
|
||||||
|
messages: this.xchat,
|
||||||
|
}
|
||||||
|
return this.request_jsonstr(req);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return a string form of json object suitable for /completions
|
||||||
|
*/
|
||||||
|
request_prompt_jsonstr() {
|
||||||
|
let prompt = "";
|
||||||
|
for(const chat of this.xchat) {
|
||||||
|
prompt += `${chat.role}: ${chat.content}\n`;
|
||||||
|
}
|
||||||
|
let req = {
|
||||||
|
prompt: prompt,
|
||||||
|
}
|
||||||
|
return this.request_jsonstr(req);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allow setting of the system prompt, but only at the beginning.
|
||||||
|
* @param {string} sysPrompt
|
||||||
|
* @param {string} msgTag
|
||||||
|
*/
|
||||||
|
add_system_begin(sysPrompt, msgTag) {
|
||||||
|
if (this.xchat.length == 0) {
|
||||||
|
if (sysPrompt.length > 0) {
|
||||||
|
return this.add(Roles.System, sysPrompt);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (sysPrompt.length > 0) {
|
||||||
|
if (this.xchat[0].role !== Roles.System) {
|
||||||
|
console.error(`ERRR:SimpleChat:SC:${msgTag}:You need to specify system prompt before any user query, ignoring...`);
|
||||||
|
} else {
|
||||||
|
if (this.xchat[0].content !== sysPrompt) {
|
||||||
|
console.error(`ERRR:SimpleChat:SC:${msgTag}:You cant change system prompt, mid way through, ignoring...`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allow setting of system prompt, at any time.
|
||||||
|
* @param {string} sysPrompt
|
||||||
|
* @param {string} msgTag
|
||||||
|
*/
|
||||||
|
add_system_anytime(sysPrompt, msgTag) {
|
||||||
|
if (sysPrompt.length <= 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.iLastSys < 0) {
|
||||||
|
return this.add(Roles.System, sysPrompt);
|
||||||
|
}
|
||||||
|
|
||||||
|
let lastSys = this.xchat[this.iLastSys].content;
|
||||||
|
if (lastSys !== sysPrompt) {
|
||||||
|
return this.add(Roles.System, sysPrompt);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieve the latest system prompt.
|
||||||
|
*/
|
||||||
|
get_system_latest() {
|
||||||
|
if (this.iLastSys == -1) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
let sysPrompt = this.xchat[this.iLastSys].content;
|
||||||
|
return sysPrompt;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
let gBaseURL = "http://127.0.0.1:8080";
|
||||||
|
let gChatURL = {
|
||||||
|
'chat': `${gBaseURL}/chat/completions`,
|
||||||
|
'completion': `${gBaseURL}/completions`,
|
||||||
|
}
|
||||||
|
const gbCompletionFreshChatAlways = true;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the class of the children, based on whether it is the idSelected or not.
|
||||||
|
* @param {HTMLDivElement} elBase
|
||||||
|
* @param {string} idSelected
|
||||||
|
* @param {string} classSelected
|
||||||
|
* @param {string} classUnSelected
|
||||||
|
*/
|
||||||
|
function el_children_config_class(elBase, idSelected, classSelected, classUnSelected="") {
|
||||||
|
for(let child of elBase.children) {
|
||||||
|
if (child.id == idSelected) {
|
||||||
|
child.className = classSelected;
|
||||||
|
} else {
|
||||||
|
child.className = classUnSelected;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create button and set it up.
|
||||||
|
* @param {string} id
|
||||||
|
* @param {(this: HTMLButtonElement, ev: MouseEvent) => any} callback
|
||||||
|
* @param {string | undefined} name
|
||||||
|
* @param {string | undefined} innerText
|
||||||
|
*/
|
||||||
|
function el_create_button(id, callback, name=undefined, innerText=undefined) {
|
||||||
|
if (!name) {
|
||||||
|
name = id;
|
||||||
|
}
|
||||||
|
if (!innerText) {
|
||||||
|
innerText = id;
|
||||||
|
}
|
||||||
|
let btn = document.createElement("button");
|
||||||
|
btn.id = id;
|
||||||
|
btn.name = name;
|
||||||
|
btn.innerText = innerText;
|
||||||
|
btn.addEventListener("click", callback);
|
||||||
|
return btn;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class MultiChatUI {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
/** @type {Object<string, SimpleChat>} */
|
||||||
|
this.simpleChats = {};
|
||||||
|
/** @type {string} */
|
||||||
|
this.curChatId = "";
|
||||||
|
|
||||||
|
// the ui elements
|
||||||
|
this.elInSystem = /** @type{HTMLInputElement} */(document.getElementById("system-in"));
|
||||||
|
this.elDivChat = /** @type{HTMLDivElement} */(document.getElementById("chat-div"));
|
||||||
|
this.elBtnUser = /** @type{HTMLButtonElement} */(document.getElementById("user-btn"));
|
||||||
|
this.elInUser = /** @type{HTMLInputElement} */(document.getElementById("user-in"));
|
||||||
|
this.elSelectApiEP = /** @type{HTMLSelectElement} */(document.getElementById("api-ep"));
|
||||||
|
this.elDivSessions = /** @type{HTMLDivElement} */(document.getElementById("sessions-div"));
|
||||||
|
|
||||||
|
this.validate_element(this.elInSystem, "system-in");
|
||||||
|
this.validate_element(this.elDivChat, "chat-div");
|
||||||
|
this.validate_element(this.elInUser, "user-in");
|
||||||
|
this.validate_element(this.elSelectApiEP, "api-ep");
|
||||||
|
this.validate_element(this.elDivChat, "sessions-div");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check that the element was found in the page.
|
||||||
|
* @param {HTMLElement | null} el
|
||||||
|
* @param {string} msgTag
|
||||||
|
*/
|
||||||
|
validate_element(el, msgTag) {
|
||||||
|
if (el == null) {
|
||||||
|
throw Error(`ERRR:SimpleChat:MCUI:${msgTag} element missing in html...`);
|
||||||
|
} else {
|
||||||
|
console.debug(`INFO:SimpleChat:MCUI:${msgTag} Id[${el.id}] Name[${el["name"]}]`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reset user input ui.
|
||||||
|
* * clear user input
|
||||||
|
* * enable user input
|
||||||
|
* * set focus to user input
|
||||||
|
*/
|
||||||
|
ui_reset_userinput() {
|
||||||
|
this.elInUser.value = "";
|
||||||
|
this.elInUser.disabled = false;
|
||||||
|
this.elInUser.focus();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Setup the needed callbacks wrt UI, curChatId to defaultChatId and
|
||||||
|
* optionally switch to specified defaultChatId.
|
||||||
|
* @param {string} defaultChatId
|
||||||
|
* @param {boolean} bSwitchSession
|
||||||
|
*/
|
||||||
|
setup_ui(defaultChatId, bSwitchSession=false) {
|
||||||
|
|
||||||
|
this.curChatId = defaultChatId;
|
||||||
|
if (bSwitchSession) {
|
||||||
|
this.handle_session_switch(this.curChatId);
|
||||||
|
}
|
||||||
|
|
||||||
|
this.elBtnUser.addEventListener("click", (ev)=>{
|
||||||
|
if (this.elInUser.disabled) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
this.handle_user_submit(this.curChatId, this.elSelectApiEP.value).catch((/** @type{Error} */reason)=>{
|
||||||
|
let msg = `ERRR:SimpleChat\nMCUI:HandleUserSubmit:${this.curChatId}\n${reason.name}:${reason.message}`;
|
||||||
|
console.debug(msg.replace("\n", ":"));
|
||||||
|
alert(msg);
|
||||||
|
this.ui_reset_userinput();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
this.elInUser.addEventListener("keyup", (ev)=> {
|
||||||
|
// allow user to insert enter into their message using shift+enter.
|
||||||
|
// while just pressing enter key will lead to submitting.
|
||||||
|
if ((ev.key === "Enter") && (!ev.shiftKey)) {
|
||||||
|
this.elBtnUser.click();
|
||||||
|
ev.preventDefault();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
this.elInSystem.addEventListener("keyup", (ev)=> {
|
||||||
|
// allow user to insert enter into the system prompt using shift+enter.
|
||||||
|
// while just pressing enter key will lead to setting the system prompt.
|
||||||
|
if ((ev.key === "Enter") && (!ev.shiftKey)) {
|
||||||
|
let chat = this.simpleChats[this.curChatId];
|
||||||
|
chat.add_system_anytime(this.elInSystem.value, this.curChatId);
|
||||||
|
chat.show(this.elDivChat);
|
||||||
|
ev.preventDefault();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Setup a new chat session and optionally switch to it.
|
||||||
|
* @param {string} chatId
|
||||||
|
* @param {boolean} bSwitchSession
|
||||||
|
*/
|
||||||
|
new_chat_session(chatId, bSwitchSession=false) {
|
||||||
|
this.simpleChats[chatId] = new SimpleChat();
|
||||||
|
if (bSwitchSession) {
|
||||||
|
this.handle_session_switch(chatId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handle user query submit request, wrt specified chat session.
|
||||||
|
* @param {string} chatId
|
||||||
|
* @param {string} apiEP
|
||||||
|
*/
|
||||||
|
async handle_user_submit(chatId, apiEP) {
|
||||||
|
|
||||||
|
let chat = this.simpleChats[chatId];
|
||||||
|
|
||||||
|
chat.add_system_anytime(this.elInSystem.value, chatId);
|
||||||
|
|
||||||
|
let content = this.elInUser.value;
|
||||||
|
if (!chat.add(Roles.User, content)) {
|
||||||
|
console.debug(`WARN:SimpleChat:MCUI:${chatId}:HandleUserSubmit:Ignoring empty user input...`);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
chat.show(this.elDivChat);
|
||||||
|
|
||||||
|
let theBody;
|
||||||
|
let theUrl = gChatURL[apiEP]
|
||||||
|
if (apiEP == ApiEP.Chat) {
|
||||||
|
theBody = chat.request_messages_jsonstr();
|
||||||
|
} else {
|
||||||
|
theBody = chat.request_prompt_jsonstr();
|
||||||
|
}
|
||||||
|
|
||||||
|
this.elInUser.value = "working...";
|
||||||
|
this.elInUser.disabled = true;
|
||||||
|
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:${theUrl}:ReqBody:${theBody}`);
|
||||||
|
let resp = await fetch(theUrl, {
|
||||||
|
method: "POST",
|
||||||
|
headers: {
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
},
|
||||||
|
body: theBody,
|
||||||
|
});
|
||||||
|
|
||||||
|
let respBody = await resp.json();
|
||||||
|
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
|
||||||
|
let assistantMsg;
|
||||||
|
if (apiEP == ApiEP.Chat) {
|
||||||
|
assistantMsg = respBody["choices"][0]["message"]["content"];
|
||||||
|
} else {
|
||||||
|
try {
|
||||||
|
assistantMsg = respBody["choices"][0]["text"];
|
||||||
|
} catch {
|
||||||
|
assistantMsg = respBody["content"];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
chat.add(Roles.Assistant, assistantMsg);
|
||||||
|
if (chatId == this.curChatId) {
|
||||||
|
chat.show(this.elDivChat);
|
||||||
|
} else {
|
||||||
|
console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`);
|
||||||
|
}
|
||||||
|
// Purposefully clear at the end rather than at the beginning of this function,
// so that one can switch from chat to completion mode and have the accumulated
// multi-turn user-assistant chat data from before sent once as a completion prompt.
|
||||||
|
if ((apiEP == ApiEP.Completion) && (gbCompletionFreshChatAlways)) {
|
||||||
|
chat.xchat.length = 0;
|
||||||
|
}
|
||||||
|
this.ui_reset_userinput();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Show buttons for NewChat and available chat sessions, in the passed elDiv.
|
||||||
|
* If elDiv is undefined/null, then use this.elDivSessions.
|
||||||
|
* Take care of highlighting the selected chat-session's btn.
|
||||||
|
* @param {HTMLDivElement | undefined} elDiv
|
||||||
|
*/
|
||||||
|
show_sessions(elDiv=undefined) {
|
||||||
|
if (!elDiv) {
|
||||||
|
elDiv = this.elDivSessions;
|
||||||
|
}
|
||||||
|
elDiv.replaceChildren();
|
||||||
|
// Btn for creating new chat session
|
||||||
|
let btnNew = el_create_button("New CHAT", (ev)=> {
|
||||||
|
if (this.elInUser.disabled) {
|
||||||
|
console.error(`ERRR:SimpleChat:MCUI:NewChat:Current session [${this.curChatId}] awaiting response, ignoring request...`);
|
||||||
|
alert("ERRR:SimpleChat\nMCUI:NewChat\nWait for response to pending query, before starting new chat session");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
let chatId = `Chat${Object.keys(this.simpleChats).length}`;
|
||||||
|
let chatIdGot = prompt("INFO:SimpleChat\nMCUI:NewChat\nEnter id for new chat session", chatId);
|
||||||
|
if (!chatIdGot) {
|
||||||
|
console.error("ERRR:SimpleChat:MCUI:NewChat:Skipping based on user request...");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
this.new_chat_session(chatIdGot, true);
|
||||||
|
this.create_session_btn(elDiv, chatIdGot);
|
||||||
|
el_children_config_class(elDiv, chatIdGot, "session-selected", "");
|
||||||
|
});
|
||||||
|
elDiv.appendChild(btnNew);
|
||||||
|
// Btns for existing chat sessions
|
||||||
|
let chatIds = Object.keys(this.simpleChats);
|
||||||
|
for(let cid of chatIds) {
|
||||||
|
let btn = this.create_session_btn(elDiv, cid);
|
||||||
|
if (cid == this.curChatId) {
|
||||||
|
btn.className = "session-selected";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
create_session_btn(elDiv, cid) {
|
||||||
|
let btn = el_create_button(cid, (ev)=>{
|
||||||
|
let target = /** @type{HTMLButtonElement} */(ev.target);
|
||||||
|
console.debug(`DBUG:SimpleChat:MCUI:SessionClick:${target.id}`);
|
||||||
|
if (this.elInUser.disabled) {
|
||||||
|
console.error(`ERRR:SimpleChat:MCUI:SessionClick:${target.id}:Current session [${this.curChatId}] awaiting response, ignoring switch...`);
|
||||||
|
alert("ERRR:SimpleChat\nMCUI:SessionClick\nWait for response to pending query, before switching");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
this.handle_session_switch(target.id);
|
||||||
|
el_children_config_class(elDiv, target.id, "session-selected", "");
|
||||||
|
});
|
||||||
|
elDiv.appendChild(btn);
|
||||||
|
return btn;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Switch ui to the specified chatId and set curChatId to same.
|
||||||
|
* @param {string} chatId
|
||||||
|
*/
|
||||||
|
async handle_session_switch(chatId) {
|
||||||
|
let chat = this.simpleChats[chatId];
|
||||||
|
if (chat == undefined) {
|
||||||
|
console.error(`ERRR:SimpleChat:MCUI:HandleSessionSwitch:${chatId} missing...`);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
this.elInSystem.value = chat.get_system_latest();
|
||||||
|
this.elInUser.value = "";
|
||||||
|
chat.show(this.elDivChat);
|
||||||
|
this.elInUser.focus();
|
||||||
|
this.curChatId = chatId;
|
||||||
|
console.log(`INFO:SimpleChat:MCUI:HandleSessionSwitch:${chatId} entered...`);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
let gMuitChat;
|
||||||
|
const gChatIds = [ "Default", "Other" ];
|
||||||
|
|
||||||
|
function startme() {
|
||||||
|
console.log("INFO:SimpleChat:StartMe:Starting...");
|
||||||
|
gMuitChat = new MultiChatUI();
|
||||||
|
for (let cid of gChatIds) {
|
||||||
|
gMuitChat.new_chat_session(cid);
|
||||||
|
}
|
||||||
|
gMuitChat.setup_ui(gChatIds[0]);
|
||||||
|
gMuitChat.show_sessions();
|
||||||
|
}
|
||||||
|
|
||||||
|
document.addEventListener("DOMContentLoaded", startme);
|
@ -102,7 +102,6 @@ struct slot_params {
|
|||||||
bool stream = true;
|
bool stream = true;
|
||||||
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
|
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
|
||||||
|
|
||||||
uint32_t seed = -1; // RNG seed
|
|
||||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||||
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
|
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
int32_t n_predict = -1; // new tokens to predict
|
||||||
@ -671,6 +670,13 @@ struct server_context {
|
|||||||
model = nullptr;
|
model = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clear any sampling context
|
||||||
|
for (server_slot & slot : slots) {
|
||||||
|
if (slot.ctx_sampling != nullptr) {
|
||||||
|
llama_sampling_free(slot.ctx_sampling);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1013,7 +1019,7 @@ struct server_context {
|
|||||||
sampler_names.emplace_back(sampler_name);
|
sampler_names.emplace_back(sampler_name);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
slot.sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
|
slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
|
||||||
} else {
|
} else {
|
||||||
slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
|
slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
|
||||||
}
|
}
|
||||||
@ -1250,14 +1256,14 @@ struct server_context {
|
|||||||
std::vector<std::string> samplers_sequence;
|
std::vector<std::string> samplers_sequence;
|
||||||
samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
|
samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
|
||||||
for (const auto & sampler_type : slot.sparams.samplers_sequence) {
|
for (const auto & sampler_type : slot.sparams.samplers_sequence) {
|
||||||
samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
|
samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
|
||||||
}
|
}
|
||||||
|
|
||||||
return json {
|
return json {
|
||||||
{"n_ctx", slot.n_ctx},
|
{"n_ctx", slot.n_ctx},
|
||||||
{"n_predict", slot.n_predict},
|
{"n_predict", slot.n_predict},
|
||||||
{"model", params.model_alias},
|
{"model", params.model_alias},
|
||||||
{"seed", slot.params.seed},
|
{"seed", slot.sparams.seed},
|
||||||
{"temperature", slot.sparams.temp},
|
{"temperature", slot.sparams.temp},
|
||||||
{"dynatemp_range", slot.sparams.dynatemp_range},
|
{"dynatemp_range", slot.sparams.dynatemp_range},
|
||||||
{"dynatemp_exponent", slot.sparams.dynatemp_exponent},
|
{"dynatemp_exponent", slot.sparams.dynatemp_exponent},
|
||||||
@ -1975,8 +1981,7 @@ struct server_context {
|
|||||||
slot.state = SLOT_STATE_PROCESSING;
|
slot.state = SLOT_STATE_PROCESSING;
|
||||||
slot.command = SLOT_COMMAND_NONE;
|
slot.command = SLOT_COMMAND_NONE;
|
||||||
slot.release();
|
slot.release();
|
||||||
slot.print_timings();
|
send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
|
||||||
send_final_response(slot);
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -2380,6 +2385,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
|
|||||||
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||||
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
||||||
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
|
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
|
||||||
|
printf(" --rpc SERVERS comma separated list of RPC servers\n");
|
||||||
printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
|
printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
|
||||||
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
|
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
|
||||||
printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
|
printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
|
||||||
@ -2432,6 +2438,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
sparams.port = std::stoi(argv[i]);
|
sparams.port = std::stoi(argv[i]);
|
||||||
|
} else if (arg == "--rpc") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.rpc_servers = argv[i];
|
||||||
} else if (arg == "--host") {
|
} else if (arg == "--host") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
@ -2840,7 +2852,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
|||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (!parse_kv_override(argv[i], params.kv_overrides)) {
|
if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
|
||||||
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
@ -3298,7 +3310,7 @@ int main(int argc, char ** argv) {
|
|||||||
const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
||||||
json request_data = json::parse(req.body);
|
json request_data = json::parse(req.body);
|
||||||
std::string filename = request_data.at("filename");
|
std::string filename = request_data.at("filename");
|
||||||
if (!validate_file_name(filename)) {
|
if (!fs_validate_filename(filename)) {
|
||||||
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -3328,7 +3340,7 @@ int main(int argc, char ** argv) {
|
|||||||
const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
||||||
json request_data = json::parse(req.body);
|
json request_data = json::parse(req.body);
|
||||||
std::string filename = request_data.at("filename");
|
std::string filename = request_data.at("filename");
|
||||||
if (!validate_file_name(filename)) {
|
if (!fs_validate_filename(filename)) {
|
||||||
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -13,6 +13,7 @@ Feature: Results
|
|||||||
|
|
||||||
Scenario Outline: consistent results with same seed
|
Scenario Outline: consistent results with same seed
|
||||||
Given <n_slots> slots
|
Given <n_slots> slots
|
||||||
|
And 1.0 temperature
|
||||||
Then the server is starting
|
Then the server is starting
|
||||||
Then the server is healthy
|
Then the server is healthy
|
||||||
|
|
||||||
@ -26,10 +27,12 @@ Feature: Results
|
|||||||
Examples:
|
Examples:
|
||||||
| n_slots |
|
| n_slots |
|
||||||
| 1 |
|
| 1 |
|
||||||
| 2 |
|
# FIXME: unified KV cache nondeterminism
|
||||||
|
# | 2 |
|
||||||
|
|
||||||
Scenario Outline: different results with different seed
|
Scenario Outline: different results with different seed
|
||||||
Given <n_slots> slots
|
Given <n_slots> slots
|
||||||
|
And 1.0 temperature
|
||||||
Then the server is starting
|
Then the server is starting
|
||||||
Then the server is healthy
|
Then the server is healthy
|
||||||
|
|
||||||
@ -70,12 +73,46 @@ Feature: Results
|
|||||||
Then all predictions are equal
|
Then all predictions are equal
|
||||||
Examples:
|
Examples:
|
||||||
| n_parallel | temp |
|
| n_parallel | temp |
|
||||||
| 1 | 0.0 |
|
| 1 | 0.0 |
|
||||||
| 2 | 0.0 |
|
| 1 | 1.0 |
|
||||||
| 4 | 0.0 |
|
# FIXME: unified KV cache nondeterminism
|
||||||
| 1 | 1.0 |
|
|
||||||
# FIXME: These tests fail on master. The problem seems to be the unified KV cache.
|
|
||||||
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
|
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
|
||||||
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 .
|
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
|
||||||
# | 2 | 1.0 |
|
# and https://github.com/ggerganov/llama.cpp/pull/7347 .
|
||||||
# | 4 | 1.0 |
|
# | 2 | 0.0 |
|
||||||
|
# | 4 | 0.0 |
|
||||||
|
# | 2 | 1.0 |
|
||||||
|
# | 4 | 1.0 |
|
||||||
|
|
||||||
|
Scenario Outline: consistent token probs with same seed and prompt
|
||||||
|
Given <n_slots> slots
|
||||||
|
And <n_kv> KV cache size
|
||||||
|
And 1.0 temperature
|
||||||
|
And <n_predict> max tokens to predict
|
||||||
|
Then the server is starting
|
||||||
|
Then the server is healthy
|
||||||
|
|
||||||
|
Given 1 prompts "The meaning of life is" with seed 42
|
||||||
|
And concurrent completion requests
|
||||||
|
# Then the server is busy # Not all slots will be utilized.
|
||||||
|
Then the server is idle
|
||||||
|
And all slots are idle
|
||||||
|
|
||||||
|
Given <n_parallel> prompts "The meaning of life is" with seed 42
|
||||||
|
And concurrent completion requests
|
||||||
|
# Then the server is busy # Not all slots will be utilized.
|
||||||
|
Then the server is idle
|
||||||
|
And all slots are idle
|
||||||
|
|
||||||
|
Then all token probabilities are equal
|
||||||
|
Examples:
|
||||||
|
| n_slots | n_kv | n_predict | n_parallel |
|
||||||
|
| 4 | 1024 | 1 | 1 |
|
||||||
|
# FIXME: unified KV cache nondeterminism
|
||||||
|
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
|
||||||
|
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
|
||||||
|
# and https://github.com/ggerganov/llama.cpp/pull/7347 .
|
||||||
|
# | 4 | 1024 | 1 | 4 |
|
||||||
|
# | 4 | 1024 | 100 | 1 |
|
||||||
|
# This test still fails even with the above patches; the first token probabilities are already different.
|
||||||
|
# | 4 | 1024 | 100 | 4 |
|
||||||
|
@ -23,6 +23,7 @@ from prometheus_client import parser
|
|||||||
def step_server_config(context, server_fqdn, server_port):
|
def step_server_config(context, server_fqdn, server_port):
|
||||||
context.server_fqdn = server_fqdn
|
context.server_fqdn = server_fqdn
|
||||||
context.server_port = int(server_port)
|
context.server_port = int(server_port)
|
||||||
|
context.n_threads = None
|
||||||
context.n_gpu_layer = None
|
context.n_gpu_layer = None
|
||||||
if 'PORT' in os.environ:
|
if 'PORT' in os.environ:
|
||||||
context.server_port = int(os.environ['PORT'])
|
context.server_port = int(os.environ['PORT'])
|
||||||
@ -109,6 +110,11 @@ def step_n_gpu_layer(context, ngl):
|
|||||||
context.n_gpu_layer = ngl
|
context.n_gpu_layer = ngl
|
||||||
|
|
||||||
|
|
||||||
|
@step('{n_threads:d} threads')
|
||||||
|
def step_n_threads(context, n_threads):
|
||||||
|
context.n_threads = n_threads
|
||||||
|
|
||||||
|
|
||||||
@step('{draft:d} as draft')
|
@step('{draft:d} as draft')
|
||||||
def step_draft(context, draft):
|
def step_draft(context, draft):
|
||||||
context.draft = draft
|
context.draft = draft
|
||||||
@ -193,7 +199,7 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
|
|||||||
|
|
||||||
case 'ready' | 'idle':
|
case 'ready' | 'idle':
|
||||||
await wait_for_health_status(context, context.base_url, 200, 'ok',
|
await wait_for_health_status(context, context.base_url, 200, 'ok',
|
||||||
timeout=10,
|
timeout=30,
|
||||||
params={'fail_on_no_slot': 0, 'include_slots': 0},
|
params={'fail_on_no_slot': 0, 'include_slots': 0},
|
||||||
slots_idle=context.n_slots,
|
slots_idle=context.n_slots,
|
||||||
slots_processing=0,
|
slots_processing=0,
|
||||||
@ -274,13 +280,22 @@ async def step_predictions_equal(context):
|
|||||||
|
|
||||||
@step('all predictions are different')
|
@step('all predictions are different')
|
||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def step_predictions_equal(context):
|
async def step_predictions_different(context):
|
||||||
n_completions = await gather_tasks_results(context)
|
n_completions = await gather_tasks_results(context)
|
||||||
assert n_completions >= 2, "need at least 2 completions"
|
assert n_completions >= 2, "need at least 2 completions"
|
||||||
assert_all_predictions_different(context.tasks_result)
|
assert_all_predictions_different(context.tasks_result)
|
||||||
context.tasks_result = []
|
context.tasks_result = []
|
||||||
|
|
||||||
|
|
||||||
|
@step('all token probabilities are equal')
|
||||||
|
@async_run_until_complete
|
||||||
|
async def step_token_probabilities_equal(context):
|
||||||
|
n_completions = await gather_tasks_results(context)
|
||||||
|
assert n_completions >= 2, "need at least 2 completions"
|
||||||
|
assert_all_token_probabilities_equal(context.tasks_result)
|
||||||
|
context.tasks_result = []
|
||||||
|
|
||||||
|
|
||||||
@step('the completion is truncated')
|
@step('the completion is truncated')
|
||||||
def step_assert_completion_truncated(context):
|
def step_assert_completion_truncated(context):
|
||||||
step_assert_completion_truncated(context, '')
|
step_assert_completion_truncated(context, '')
|
||||||
@ -868,7 +883,8 @@ async def request_completion(prompt,
|
|||||||
"cache_prompt": cache_prompt,
|
"cache_prompt": cache_prompt,
|
||||||
"id_slot": id_slot,
|
"id_slot": id_slot,
|
||||||
"seed": seed if seed is not None else 42,
|
"seed": seed if seed is not None else 42,
|
||||||
"temperature": temperature if temperature is not None else "0.8f",
|
"temperature": temperature if temperature is not None else 0.8,
|
||||||
|
"n_probs": 2,
|
||||||
},
|
},
|
||||||
headers=headers,
|
headers=headers,
|
||||||
timeout=3600) as response:
|
timeout=3600) as response:
|
||||||
@ -887,6 +903,7 @@ async def oai_chat_completions(user_prompt,
|
|||||||
base_path,
|
base_path,
|
||||||
async_client,
|
async_client,
|
||||||
debug=False,
|
debug=False,
|
||||||
|
temperature=None,
|
||||||
model=None,
|
model=None,
|
||||||
n_predict=None,
|
n_predict=None,
|
||||||
enable_streaming=None,
|
enable_streaming=None,
|
||||||
@ -913,7 +930,8 @@ async def oai_chat_completions(user_prompt,
|
|||||||
"model": model,
|
"model": model,
|
||||||
"max_tokens": n_predict,
|
"max_tokens": n_predict,
|
||||||
"stream": enable_streaming,
|
"stream": enable_streaming,
|
||||||
"seed": seed
|
"temperature": temperature if temperature is not None else 0.0,
|
||||||
|
"seed": seed,
|
||||||
}
|
}
|
||||||
if response_format is not None:
|
if response_format is not None:
|
||||||
payload['response_format'] = response_format
|
payload['response_format'] = response_format
|
||||||
@ -978,7 +996,8 @@ async def oai_chat_completions(user_prompt,
|
|||||||
max_tokens=n_predict,
|
max_tokens=n_predict,
|
||||||
stream=enable_streaming,
|
stream=enable_streaming,
|
||||||
response_format=payload.get('response_format'),
|
response_format=payload.get('response_format'),
|
||||||
seed=seed
|
seed=seed,
|
||||||
|
temperature=payload['temperature']
|
||||||
)
|
)
|
||||||
except openai.error.AuthenticationError as e:
|
except openai.error.AuthenticationError as e:
|
||||||
if expect_api_error is not None and expect_api_error:
|
if expect_api_error is not None and expect_api_error:
|
||||||
@ -1120,6 +1139,23 @@ def assert_all_predictions_different(completion_responses):
|
|||||||
assert content_i != content_j, "contents not different"
|
assert content_i != content_j, "contents not different"
|
||||||
|
|
||||||
|
|
||||||
|
def assert_all_token_probabilities_equal(completion_responses):
    n_predict = len(completion_responses[0]['completion_probabilities'])
    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
        for pos in range(n_predict):
            for i, response_i in enumerate(completion_responses):
                probs_i = response_i['completion_probabilities'][pos]['probs']
                print(f"pos {pos}, probs {i}: {probs_i}")
    for pos in range(n_predict):
        for i, response_i in enumerate(completion_responses):
            probs_i = response_i['completion_probabilities'][pos]['probs']
            for j, response_j in enumerate(completion_responses):
                if i == j:
                    continue
                probs_j = response_j['completion_probabilities'][pos]['probs']
                assert probs_i == probs_j, "contents not equal"
|
||||||
|
|
||||||
|
|
||||||
async def gather_tasks_results(context):
|
async def gather_tasks_results(context):
|
||||||
n_tasks = len(context.concurrent_tasks)
|
n_tasks = len(context.concurrent_tasks)
|
||||||
if context.debug:
|
if context.debug:
|
||||||
@ -1258,6 +1294,8 @@ def start_server_background(context):
|
|||||||
server_args.extend(['--batch-size', context.n_batch])
|
server_args.extend(['--batch-size', context.n_batch])
|
||||||
if context.n_ubatch:
|
if context.n_ubatch:
|
||||||
server_args.extend(['--ubatch-size', context.n_ubatch])
|
server_args.extend(['--ubatch-size', context.n_ubatch])
|
||||||
|
if context.n_threads:
|
||||||
|
server_args.extend(['--threads', context.n_threads])
|
||||||
if context.n_gpu_layer:
|
if context.n_gpu_layer:
|
||||||
server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
|
server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
|
||||||
if context.draft is not None:
|
if context.draft is not None:
|
||||||
|
@ -371,7 +371,7 @@ static json oaicompat_completion_params_parse(
|
|||||||
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
|
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
|
||||||
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
|
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
|
||||||
llama_params["stream"] = json_value(body, "stream", false);
|
llama_params["stream"] = json_value(body, "stream", false);
|
||||||
llama_params["temperature"] = json_value(body, "temperature", 0.0);
|
llama_params["temperature"] = json_value(body, "temperature", 1.0);
|
||||||
llama_params["top_p"] = json_value(body, "top_p", 1.0);
|
llama_params["top_p"] = json_value(body, "top_p", 1.0);
|
||||||
|
|
||||||
// Apply chat template to the list of messages
|
// Apply chat template to the list of messages
|
||||||
|
@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
|
|||||||
|
|
||||||
:: for FP16
|
:: for FP16
|
||||||
:: faster for long-prompt inference
|
:: faster for long-prompt inference
|
||||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||||
|
|
||||||
:: for FP32
|
:: for FP32
|
||||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||||
if %errorlevel% neq 0 goto ERROR
|
if %errorlevel% neq 0 goto ERROR
|
||||||
:: build example/main only
|
:: build example/main only
|
||||||
:: make main
|
:: make main
|
||||||
|
@ -3,40 +3,390 @@
|
|||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <fstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
#if defined(_WIN32)
|
||||||
if (argc < 3 || argv[1][0] == '-') {
|
#define WIN32_LEAN_AND_MEAN
|
||||||
printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]);
|
#include <windows.h>
|
||||||
|
#include <shellapi.h> // For CommandLineToArgvW
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static void print_usage_information(const char * argv0, FILE * stream) {
|
||||||
|
fprintf(stream, "usage: %s [options]\n\n", argv0);
|
||||||
|
fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
|
||||||
|
fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
|
||||||
|
fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
|
||||||
|
fprintf(stream, "to control the behavior of the tokenizer.\n\n");
|
||||||
|
fprintf(stream, " The possible options are:\n");
|
||||||
|
fprintf(stream, "\n");
|
||||||
|
fprintf(stream, " -h, --help print this help and exit\n");
|
||||||
|
fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n");
|
||||||
|
fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n");
|
||||||
|
fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
|
||||||
|
fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
|
||||||
|
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
|
||||||
|
fprintf(stream, " --stdin read prompt from standard input.\n");
|
||||||
|
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
|
||||||
|
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
|
||||||
|
(void) level;
|
||||||
|
(void) text;
|
||||||
|
(void) user_data;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string read_prompt_from_file(const char * filepath, bool & success) {
|
||||||
|
success = false;
|
||||||
|
|
||||||
|
std::ifstream in(filepath, std::ios::binary);
|
||||||
|
if (!in) {
|
||||||
|
fprintf(stderr, "%s: could not open file '%s' for reading: %s\n", __func__, filepath, strerror(errno));
|
||||||
|
return std::string();
|
||||||
|
}
|
||||||
|
// do not assume the file is seekable (e.g. /dev/stdin)
|
||||||
|
std::stringstream buffer;
|
||||||
|
buffer << in.rdbuf();
|
||||||
|
if (in.fail()) {
|
||||||
|
fprintf(stderr, "%s: could not read the entire file '%s': %s\n", __func__, filepath, strerror(errno));
|
||||||
|
return std::string();
|
||||||
|
}
|
||||||
|
|
||||||
|
success = true;
|
||||||
|
return buffer.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Function: ingest_args(...) -> vector<string>
|
||||||
|
//
|
||||||
|
// Takes argc and argv arguments, and converts them to a vector of UTF-8 encoded
|
||||||
|
// strings, as an STL vector<string>.
|
||||||
|
//
|
||||||
|
// In particular, it handles character encoding shenanigans on Windows.
|
||||||
|
//
|
||||||
|
// Note: raw_argc and raw_argv are not actually read at all on Windows.
|
||||||
|
// On Windows we call GetCommandLineW to get the arguments in wchar_t
|
||||||
|
// format, ignoring the regular argc/argv arguments to main().
|
||||||
|
//
|
||||||
|
// TODO: potential opportunity to roll common stuff into common/console.cpp
|
||||||
|
// in relation to Windows wchar_t shenanigans.
|
||||||
|
static std::vector<std::string> ingest_args(int raw_argc, char ** raw_argv) {
|
||||||
|
std::vector<std::string> argv;
|
||||||
|
|
||||||
|
// Handle Windows, if given non-ASCII arguments.
|
||||||
|
// We convert wchar_t arguments into UTF-8 char* on this platform.
|
||||||
|
// Lets you invoke 'tokenize' on Windows cmd.exe with non-ASCII characters
|
||||||
|
// without throwing tantrums.
|
||||||
|
#if defined(_WIN32)
|
||||||
|
int argc;
|
||||||
|
const LPWSTR cmdline_wargv = GetCommandLineW();
|
||||||
|
LPWSTR * wargv = CommandLineToArgvW(cmdline_wargv, &argc);
|
||||||
|
|
||||||
|
// silence unused arg warnings
|
||||||
|
(void) raw_argc;
|
||||||
|
(void) raw_argv;
|
||||||
|
|
||||||
|
for (int i = 0; i < argc; ++i) {
|
||||||
|
int length_needed = WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), 0, 0, NULL, NULL);
|
||||||
|
char * output_buf = (char *) calloc(length_needed+1, sizeof(char));
|
||||||
|
GGML_ASSERT(output_buf);
|
||||||
|
|
||||||
|
WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), output_buf, length_needed, NULL, NULL);
|
||||||
|
output_buf[length_needed] = '\0';
|
||||||
|
|
||||||
|
argv.push_back(output_buf);
|
||||||
|
free(output_buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
LocalFree((HLOCAL) wargv);
|
||||||
|
#else
|
||||||
|
int argc = raw_argc;
|
||||||
|
for (int i = 0; i < argc; ++i) {
|
||||||
|
argv.push_back(raw_argv[i]);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
GGML_ASSERT((unsigned int) argc == argv.size());
|
||||||
|
|
||||||
|
return argv;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
// Function: write_utf8_cstr_to_stdout(const char *) -> <writes to stdout>
//
// writes a string to standard output; taking into account that on Windows
// to display correctly you have to use special handling. Works even if the
// user has not set a unicode code page on a Windows cmd.exe.
//
// In case of invalid UTF-8, invalid_utf8 is set to true on Windows, and
// something human-readable is written instead.
//
// On non-Windows systems, simply printf()s the string.
static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
    invalid_utf8 = false;

#if defined(_WIN32)
    // Are we in a console?
    HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
    DWORD dwMode = 0;

    // According to Microsoft docs:
    // "WriteConsole fails if it is used with a standard handle that is redirected to a file."
    // Also according to the docs, you can use GetConsoleMode to check for that.
    if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
        printf("%s", str);
        return;
    }

    // MultiByteToWideChar reports an error if str is empty, don't report
    // them as invalid_utf8.
    if (*str == 0) {
        return;
    }
    int length_needed = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, strlen(str), NULL, 0);
    if (length_needed == 0) {
        DWORD err = GetLastError();
        if (err == ERROR_NO_UNICODE_TRANSLATION) {
            invalid_utf8 = true;
            int len = strlen(str);
            printf("<");
            for (int i = 0; i < len; ++i) {
                if (i > 0) {
                    printf(" ");
                }
                printf("%02x", (uint8_t) str[i]);
            }
            printf(">");
            return;
        }
        GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
    }

    LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));
    GGML_ASSERT(wstr);

    MultiByteToWideChar(CP_UTF8, 0, str, strlen(str), wstr, length_needed);
    WriteConsoleW(hConsole, wstr, length_needed, NULL, NULL);

    free(wstr);
#else
    // TODO: reporting invalid_utf8 would be useful on non-Windows too.
    // printf will silently just write bad unicode.
    printf("%s", str);
#endif
}
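
// Worked example (hypothetical input, for illustration only): a lone 0xC3 byte
// is an incomplete UTF-8 sequence, so MultiByteToWideChar() above fails with
// ERROR_NO_UNICODE_TRANSLATION and the fallback path prints "<c3>" instead of
// sending the raw byte to the console.
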
int main(int raw_argc, char ** raw_argv) {
    const std::vector<std::string> argv = ingest_args(raw_argc, raw_argv);
    const int argc = argv.size();

    if (argc <= 1) {
        print_usage_information(argv[0].c_str(), stderr);
        return 1;
    }

    //////
    // Read out all the command line arguments.
    //////

    // variables where to put any arguments we see.
    bool printing_ids = false;
    bool no_bos = false;
    bool disable_logging = false;
    const char * model_path = NULL;
    const char * prompt_path = NULL;
    const char * prompt_arg = NULL;

    // track which arguments were explicitly given
    // used for sanity checking down the line
    bool model_path_set = false;
    bool prompt_path_set = false;
    bool prompt_set = false;
    bool stdin_set = false;

    int iarg = 1;
    for (; iarg < argc; ++iarg) {
        std::string arg{argv[iarg]};
        if (arg == "-h" || arg == "--help") {
            print_usage_information(argv[0].c_str(), stdout);
            return 0;
        }
        else if (arg == "--ids") {
            printing_ids = true;
        }
        else if (arg == "-m" || arg == "--model") {
            if (model_path_set) {
                fprintf(stderr, "Error: -m or --model specified multiple times.\n");
                return 1;
            }
            model_path = argv[++iarg].c_str();
            model_path_set = true;
        }
        else if (arg == "--no-bos") {
            no_bos = true;
        }
        else if (arg == "-p" || arg == "--prompt") {
            if (prompt_set) {
                fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
                return 1;
            }
            prompt_arg = argv[++iarg].c_str();
            prompt_set = true;
        }
        else if (arg == "-f" || arg == "--file") {
            if (prompt_path_set) {
                fprintf(stderr, "Error: -f or --file specified multiple times.\n");
                return 1;
            }
            prompt_path = argv[++iarg].c_str();
            prompt_path_set = true;
        }
        else if (arg == "--stdin") {
            stdin_set = true;
        }
        else if (arg == "--log-disable") {
            disable_logging = true;
        }
        else {
            fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
            return 1;
        }
    }

    //////
    // Sanity check the command line arguments.
    //////

    // Check that we have the required stuff set.
    if (model_path_set && model_path == NULL) {
        fprintf(stderr, "Error: --model requires an argument.\n");
        return 1;
    }
    if (!model_path_set) {
        fprintf(stderr, "Error: must specify --model.\n");
        return 1;
    }
    if (prompt_path_set && prompt_path == NULL) {
        fprintf(stderr, "Error: --file requires an argument.\n");
        return 1;
    }
    if (prompt_set && prompt_arg == NULL) {
        fprintf(stderr, "Error: --prompt requires an argument.\n");
        return 1;
    }
    const int prompts_set = !!(prompt_path_set) + !!(prompt_set) + !!(stdin_set);
    if (prompts_set > 1) {
        fprintf(stderr, "Error: --stdin, --file and --prompt are mutually exclusive.\n");
        return 1;
    }
    // Must have some prompt.
    if (prompts_set == 0) {
        fprintf(stderr, "Error: must specify one of: --stdin, --file or --prompt.\n");
        return 1;
    }

    GGML_ASSERT(model_path);
    GGML_ASSERT(prompt_path || prompt_arg || stdin_set);

    //////
    // Figure out where will the prompt come from.
    //////

    std::string prompt;
    if (prompt_path_set) {
        bool success = false;
        prompt = read_prompt_from_file(prompt_path, success);
        if (!success) {
            return 1;
        }
    } else if (prompt_set) {
        prompt = prompt_arg;
    } else {
        GGML_ASSERT(stdin_set);
        // we read stdin *after* loading model (early exit if model cannot
        // be loaded, which can be a nicer user experience)
    }

    //////
    // Start actually doing the tokenizing stuff.
    //////

#ifdef LOG_DISABLE_LOGS
    disable_logging = true;
#endif

    if (disable_logging) {
        llama_log_set(llama_log_callback_null, NULL);
    }

    llama_backend_init();

    llama_model_params model_params = llama_model_default_params();
    model_params.vocab_only = true;
    llama_model * model = llama_load_model_from_file(model_path, model_params);
    if (!model) {
        fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path);
        return 1;
    }

    llama_context_params ctx_params = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
    if (!ctx) {
        fprintf(stderr, "Error: could not create context.\n");
        return 1;
    }

    // read entire prompt from stdin?
    if (stdin_set) {
        GGML_ASSERT(!prompt_path_set && !prompt_set);

        std::stringstream stdin_buffer;
        stdin_buffer << std::cin.rdbuf();
        if (std::cin.fail()) {
            fprintf(stderr, "Error: could not read the entire standard input.\n");
            return 1;
        }

        prompt = stdin_buffer.str();
    }

    const bool model_wants_add_bos = llama_should_add_bos_token(model);
    const bool add_bos = model_wants_add_bos && !no_bos;

    std::vector<llama_token> tokens;
    tokens = ::llama_tokenize(model, prompt, add_bos, true);

    if (printing_ids) {
        printf("[");
    }

    for (int i = 0; i < (int) tokens.size(); i++) {
        if (printing_ids) {
            if (i > 0) {
                printf(", ");
            }
            printf("%d", tokens[i]);
        } else {
            bool invalid_utf8 = false;
            printf("%6d -> '", tokens[i]);
            write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
            if (invalid_utf8) {
                printf("' (utf-8 decode failure)\n");
            } else {
                printf("'\n");
            }
        }
    }

    if (printing_ids) {
        printf("]\n");
    }

    // silence valgrind
    llama_free(ctx);
    llama_free_model(model);

    return 0;
}
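
Taken together, the options parsed in main() allow invocations such as `tokenize -m ./model.gguf --prompt "Hello world" --ids` (the model path and prompt here are placeholders, not values taken from the commit): with `--ids` the program prints the token ids as a single bracketed list, and without it one `id -> 'piece'` line per token, routed through write_utf8_cstr_to_stdout() so the pieces display correctly on Windows consoles.
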
@ -301,8 +301,8 @@ static struct ggml_tensor * llama_build_train_graphs(
        // not capturing these, to silcence warnings
        const int rope_mode = 0;

        return ggml_rope_custom(
        return ggml_rope_ext(
            ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
            ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
        );
    };

@ -341,7 +341,8 @@ static struct ggml_tensor * llama_build_train_graphs(
    struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
    struct ggml_tensor * t16;
    if (enable_flash_attn) {
        t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
        GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
        //t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
    } else {
        struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
        struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
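
Note on the rope change above: as far as this diff shows, ggml_rope_ext() takes one extra tensor argument between KQ_pos and n_rot that ggml_rope_custom() did not; the call site only needs to thread that additional argument through, passing nullptr when the new parameter is unused.
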
12
flake.lock
generated
@ -5,11 +5,11 @@
        "nixpkgs-lib": "nixpkgs-lib"
      },
      "locked": {
        "lastModified": 1714641030,
        "lastModified": 1715865404,
        "narHash": "sha256-yzcRNDoyVP7+SCNX0wmuDju1NUCt8Dz9+lyUXEI0dbI=",
        "narHash": "sha256-/GJvTdTpuDjNn84j82cU6bXztE0MSkdnTWClUCRub78=",
        "owner": "hercules-ci",
        "repo": "flake-parts",
        "rev": "e5d10a24b66c3ea8f150e47dfdb0416ab7c3390e",
        "rev": "8dc45382d5206bd292f9c2768b8058a8fd8311d9",
        "type": "github"
      },
      "original": {
@ -20,11 +20,11 @@
      },
    "nixpkgs": {
      "locked": {
        "lastModified": 1714635257,
        "lastModified": 1715961556,
        "narHash": "sha256-4cPymbty65RvF1DWQfc+Bc8B233A1BWxJnNULJKQ1EY=",
        "narHash": "sha256-+NpbZRCRisUHKQJZF3CT+xn14ZZQO+KjxIIanH3Pvn4=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "63c3a29ca82437c87573e4c6919b09a24ea61b0f",
        "rev": "4a6b83b05df1a8bd7d99095ec4b4d271f2956b64",
        "type": "github"
      },
      "original": {

@ -1895,7 +1895,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t

    tensor->buffer = buffer;
    tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
    tensor->backend = tensor->view_src->backend;
    ggml_backend_buffer_init_tensor(buffer, tensor);
}

@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
// QK = number of values after dequantization
// QK_K = super-block size

#ifdef GGML_QKK_64
#define QK_K 64
#define K_SCALE_SIZE 4
#else
#define QK_K 256
#define K_SCALE_SIZE 12
#endif // GGML_QKK_64

#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
// QR = QK / number of values before dequantization

@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
#define QI4_NL (QK4_NL / (4*QR4_NL))
#define QR4_NL 2

#if QK_K == 64
#define QI4_XS QI4_NL
#define QR4_XS QR4_NL
#else
#define QI4_XS (QK_K / (4*QR4_XS))
#define QR4_XS 8
#endif

#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP

@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
// weight is represented as x = a * q
// 16 blocks of 16 elements each
// Effectively 3.4375 bits per weight
#ifdef GGML_QKK_64
typedef struct {
    uint8_t hmask[QK_K/8]; // quants - high bit
    uint8_t qs[QK_K/4];    // quants - low 2 bits
    uint8_t scales[2];
    ggml_half d;           // super-block scale
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
#else
typedef struct {
    uint8_t hmask[QK_K/8]; // quants - high bit
    uint8_t qs[QK_K/4];    // quants - low 2 bits
@ -244,20 +225,11 @@ typedef struct {
    ggml_half d;           // super-block scale
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
#endif

// 4-bit quantization
// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
// Effectively 4.5 bits per weight
#ifdef GGML_QKK_64
typedef struct {
    ggml_half d[2];     // super-block scales/mins
    uint8_t scales[2];  // 4-bit block scales/mins
    uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
#else
typedef struct {
    union {
        struct {
@ -270,21 +242,11 @@ typedef struct {
    uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
#endif

// 5-bit quantization
// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
// Effectively 5.5 bits per weight
#ifdef GGML_QKK_64
typedef struct {
    ggml_half d;             // super-block scale
    int8_t scales[QK_K/16];  // 8-bit block scales
    uint8_t qh[QK_K/8];      // quants, high bit
    uint8_t qs[QK_K/2];      // quants, low 4 bits
} block_q5_K;
static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
#else
typedef struct {
    union {
        struct {
@ -298,7 +260,6 @@ typedef struct {
    uint8_t qs[QK_K/2]; // quants, low 4 bits
} block_q5_K;
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
#endif

// 6-bit quantization
// weight is represented as x = a * q

@ -356,11 +317,7 @@ typedef struct {
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");

// 3.4375 bpw
#if QK_K == 64
#define IQ3S_N_SCALE 2
#else
#define IQ3S_N_SCALE QK_K/64
#endif
typedef struct {
    ggml_half d;
    uint8_t qs[QK_K/4];

@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
typedef struct {
    uint8_t qs[QK_K/8];     // grid index, low 8 bits
    uint8_t qh[QK_K/16];    // grid index, high 3 bits + grid shift bit (for two groups of 8)
#if QK_K == 64
    ggml_half d;
#endif
    uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
} block_iq1_m;
#if QK_K == 64
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
#else
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
#endif

// Used by IQ1_M quants
typedef union {

@ -406,9 +356,6 @@ typedef struct {
} block_iq4_nl;
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");

#if QK_K == 64
#define block_iq4_xs block_iq4_nl
#else
typedef struct {
    ggml_half d;
    uint16_t scales_h;
@ -416,7 +363,6 @@ typedef struct {
    uint8_t qs[QK_K/2];
} block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
#endif

#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL

115
ggml-cuda.cu
@ -43,19 +43,59 @@
#include <mutex>
#include <stdint.h>
#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string>
#include <vector>

static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
    GGML_UNUSED(level);
    GGML_UNUSED(user_data);
    fprintf(stderr, "%s", msg);
}

ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
void * ggml_cuda_log_user_data = NULL;

GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
    ggml_cuda_log_callback = log_callback;
    ggml_cuda_log_user_data = user_data;
}

#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)

GGML_ATTRIBUTE_FORMAT(2, 3)
static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
    if (ggml_cuda_log_callback != NULL) {
        va_list args;
        va_start(args, format);
        char buffer[128];
        int len = vsnprintf(buffer, 128, format, args);
        if (len < 128) {
            ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
        } else {
            std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
            va_end(args);
            va_start(args, format);
            vsnprintf(&buffer2[0], buffer2.size(), format, args);
            ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
        }
        va_end(args);
    }
}

[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
    int id = -1; // in case cudaGetDevice fails
    cudaGetDevice(&id);

    fprintf(stderr, "CUDA error: %s\n", msg);
    GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
    fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line);
    GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
    fprintf(stderr, " %s\n", stmt);
    GGML_CUDA_LOG_ERROR(" %s\n", stmt);
    // abort with GGML_ASSERT to get a stack trace
    GGML_ASSERT(!"CUDA error");
}
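
A minimal sketch of how an application could hook into the new logging path (my_log_sink and install_cuda_logging are made-up names for illustration; only ggml_backend_cuda_log_set_callback and the ggml_log_level/ggml_log_callback types come from the sources above):

    #include "ggml-cuda.h"
    #include <cstdio>

    // forward all CUDA backend messages to a caller-provided FILE*
    static void my_log_sink(enum ggml_log_level level, const char * text, void * user_data) {
        FILE * f = (FILE *) user_data;   // user_data carries the destination stream
        fprintf(f, "[ggml-cuda][%d] %s", (int) level, text);
    }

    // call once at startup, before the backend starts emitting messages
    static void install_cuda_logging(FILE * f) {
        ggml_backend_cuda_log_set_callback(my_log_sink, f);
    }

By default the backend keeps the previous behaviour of writing to stderr through ggml_cuda_default_log_callback, so installing a callback is optional.
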
@ -91,7 +131,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

    cudaError_t err = cudaGetDeviceCount(&info.device_count);
    if (err != cudaSuccess) {
        fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
        GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
        return info;
    }

@ -99,16 +139,16 @@ static ggml_cuda_device_info ggml_cuda_init() {

    int64_t total_vram = 0;
#if defined(GGML_CUDA_FORCE_MMQ)
    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
#else
    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
#endif
#if defined(CUDA_USE_TENSOR_CORES)
    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
#else
    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
#endif
    fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
    for (int id = 0; id < info.device_count; ++id) {
        int device_vmm = 0;

@ -129,7 +169,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

        cudaDeviceProp prop;
        CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
        fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
        GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

        info.default_tensor_split[id] = total_vram;
        total_vram += prop.totalGlobalMem;

@ -235,8 +275,8 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
        *actual_size = look_ahead_size;
        pool_size += look_ahead_size;
#ifdef DEBUG_CUDA_MALLOC
        fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
        GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
            (uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024));
            (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
#endif
        return ptr;
    }

@ -250,7 +290,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                return;
            }
        }
        fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
        GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
        ggml_cuda_set_device(device);
        CUDA_CHECK(cudaFree(ptr));
        pool_size -= size;
@ -499,7 +539,9 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
    void * dev_ptr;
    cudaError_t err = cudaMalloc(&dev_ptr, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
        // clear the error
        cudaGetLastError();
        GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
        return nullptr;
    }

@ -1002,8 +1044,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
    if (err != cudaSuccess) {
        // clear the error
        cudaGetLastError();
        fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
        GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
            size/1024.0/1024.0, cudaGetErrorString(err));
            size / 1024.0 / 1024.0, cudaGetErrorString(err));
        return nullptr;
    }

@ -2246,7 +2288,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
            break;
        case GGML_OP_MUL_MAT:
            if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
                fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                return false;
            } else {
                ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);

@ -2300,7 +2342,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
        GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
        CUDA_CHECK(err);
    }

@ -2476,7 +2518,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
        if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
            cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
#ifndef NDEBUG
            fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
#endif
        }
    }

@ -2523,14 +2565,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
            if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
#ifndef NDEBUG
                fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
#endif
            }

            if (node->op == GGML_OP_MUL_MAT_ID) {
                use_cuda_graph = false; // This node type is not supported by CUDA graph capture
#ifndef NDEBUG
                fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
#endif
            }

@ -2539,7 +2581,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                // Changes in batch size or context size can cause changes to the grid size of some kernels.
                use_cuda_graph = false;
#ifndef NDEBUG
                fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
#endif
            }

@ -2558,7 +2600,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
        }

        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
        if (cuda_graph_update_required) {
        if (use_cuda_graph && cuda_graph_update_required) {
            cuda_ctx->cuda_graph->number_consecutive_updates++;
        } else {
            cuda_ctx->cuda_graph->number_consecutive_updates = 0;

@ -2567,7 +2609,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
        if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
            cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
#ifndef NDEBUG
            fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
#endif
        }
    }

@ -2605,7 +2647,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t

            bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
            if (!ok) {
                fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
                GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
            }
            GGML_ASSERT(ok);
        }

@ -2624,7 +2666,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                use_cuda_graph = false;
                cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
#ifndef NDEBUG
                fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
#endif
            } else {
                graph_evaluated_or_captured = true; // CUDA graph has been captured

@ -2691,7 +2733,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
            cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
            if (stat == cudaErrorGraphExecUpdateFailure) {
#ifndef NDEBUG
                fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
                GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
#endif
                // The pre-existing graph exec cannot be updated due to violated constraints
                // so instead clear error and re-instantiate
@ -2713,6 +2755,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
}

GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
    switch (op->op) {
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {

@ -2840,8 +2883,16 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_LEAKY_RELU:
        case GGML_OP_FLASH_ATTN_EXT:
            return true;
        case GGML_OP_FLASH_ATTN_EXT:
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
            return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
#else
            if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
                return true;
            }
            return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
        default:
            return false;
    }
@ -2939,13 +2990,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {

GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
    if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
        return nullptr;
    }

    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
    if (ctx == nullptr) {
        fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
        return nullptr;
    }

@ -2989,8 +3040,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
        // clear the error
        cudaGetLastError();

        fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
        GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
            size/1024.0/1024.0, cudaGetErrorString(err));
            size / 1024.0 / 1024.0, cudaGetErrorString(err));
        return false;
    }
    return true;

@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);

GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
#ifdef __cplusplus
}
#endif

@ -315,12 +315,30 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#endif
    return c;
}

#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
// __shfl_xor() for half2 was added in ROCm 5.6
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
    typedef union half2_b32 {
        half2 val;
        int   b32;
    } half2_b32_t;
    half2_b32_t tmp;
    tmp.val = var;
    tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
    return tmp.val;
}
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
#endif // defined(GGML_USE_HIPBLAS)

#define FP16_AVAILABLE     (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL

#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA

static bool fast_fp16_available(const int cc) {
    return cc >= CC_PASCAL && cc != 610;
}

static bool fp16_mma_available(const int cc) {
    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
}

@ -459,6 +477,17 @@ static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -

typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);

static __device__ __forceinline__ float get_alibi_slope(
    const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

    return powf(base, exph);
}

//////////////////////
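
For context, get_alibi_slope() expects the caller to supply m0, m1 and n_head_log2; a sketch of how those values are typically derived on the host side following the usual ALiBi recipe (this derivation is not part of the diff above and is shown only as an assumption about the call sites; n_head, max_bias and h stand for the model's head count, the ALiBi max bias and the head index):

    #include <math.h>

    // host-side helper mirroring how the kernel parameters are usually computed
    static float alibi_slope_for_head(const float max_bias, const uint32_t h, const int n_head) {
        // n_head_log2 is the largest power of two not exceeding n_head
        const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));

        // heads below n_head_log2 use base m0, the remaining heads use m1
        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

        // same formula the device helper evaluates
        const float base = h < n_head_log2 ? m0 : m1;
        const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
        return powf(base, exph);
    }
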
@ -131,7 +131,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
    const block_q2_K * x = (const block_q2_K *) vx;

    const int64_t tid = threadIdx.x;
#if QK_K == 256
    const int64_t n   = tid/32;
    const int64_t l   = tid - 32*n;
    const int64_t is  = 8*n + l/16;

@ -145,17 +144,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
#else
    const int64_t is = tid/16;  // 0 or 1
    const int64_t il = tid%16;  // 0...15
    const uint8_t q = x[i].qs[il] >> (2*is);
    dst_t * y = yy + i*QK_K + 16*is + il;
    float dall = __low2half(x[i].dm);
    float dmin = __high2half(x[i].dm);
    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
#endif

}

template<typename dst_t>

@ -164,7 +152,6 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
    const int64_t i = blockIdx.x;
    const block_q3_K * x = (const block_q3_K *) vx;

#if QK_K == 256
    const int64_t r = threadIdx.x/4;
    const int64_t tid = r/2;
    const int64_t is0 = r%2;

@ -188,31 +175,8 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
    const uint8_t * hm = x[i].hmask;

    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
#else
    const int64_t tid = threadIdx.x;
    const int64_t is  = tid/16; // 0 or 1
    const int64_t il  = tid%16; // 0...15
    const int64_t im  = il/8;   // 0...1
    const int64_t in  = il%8;   // 0...7

    dst_t * y = yy + i*QK_K + 16*is + il;

    const uint8_t q = x[i].qs[il] >> (2*is);
    const uint8_t h = x[i].hmask[in] >> (2*is + im);
    const float   d = (float)x[i].d;

    if (is == 0) {
        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
    } else {
        y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
        y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
    }
#endif

}

#if QK_K == 256
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
    if (j < 4) {
        d = q[j] & 63; m = q[j + 4] & 63;
@ -221,7 +185,6 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
        m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
    }
}
#endif

template<typename dst_t>
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

@ -229,7 +192,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t

    const int64_t i = blockIdx.x;

#if QK_K == 256
    // assume 32 threads
    const int64_t tid = threadIdx.x;
    const int64_t il  = tid/8;

@ -253,15 +215,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
        y[l + 0] = d1 * (q[l] & 0xF) - m1;
        y[l +32] = d2 * (q[l] >>  4) - m2;
    }
#else
    const int64_t tid = threadIdx.x;
    const uint8_t * q = x[i].qs;
    dst_t * y = yy + i*QK_K;
    const float d = (float)x[i].dm[0];
    const float m = (float)x[i].dm[1];
    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
#endif
}

template<typename dst_t>
@ -270,7 +223,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t

    const int64_t i = blockIdx.x;

#if QK_K == 256
    // assume 64 threads - this is very slightly better than the one below
    const int64_t tid = threadIdx.x;
    const int64_t il  = tid/16; // il is in 0...3

@ -297,18 +249,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
    hm <<= 1;
    y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
    y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
#else
    const int64_t tid = threadIdx.x;
    const uint8_t q = x[i].qs[tid];
    const int64_t im = tid/8;  // 0...3
    const int64_t in = tid%8;  // 0...7
    const int64_t is = tid/16; // 0 or 1
    const uint8_t h = x[i].qh[in] >> im;
    const float   d = x[i].d;
    dst_t * y = yy + i*QK_K + tid;
    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
#endif
}

template<typename dst_t>
@ -316,7 +256,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
    const block_q6_K * x = (const block_q6_K *) vx;

    const int64_t i = blockIdx.x;
#if QK_K == 256

    // assume 64 threads - this is very slightly better than the one below
    const int64_t tid = threadIdx.x;

@ -336,24 +275,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
    y[64] = d * sc[4] * ((int8_t)((ql[ 0] >>  4) | (((qh >> 4) & 3) << 4)) - 32);
    y[96] = d * sc[6] * ((int8_t)((ql[32] >>  4) | (((qh >> 6) & 3) << 4)) - 32);
#else

    // assume 32 threads
    const int64_t tid = threadIdx.x;
    const int64_t ip  = tid/16;      // 0 or 1
    const int64_t il  = tid - 16*ip; // 0...15

    dst_t * y = yy + i*QK_K + 16*ip + il;

    const float d = x[i].d;

    const uint8_t ql = x[i].ql[16*ip + il];
    const uint8_t qh = x[i].qh[il] >> (2*ip);
    const int8_t * sc = x[i].scales;

    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
#endif
}

template<typename dst_t>
@ -363,7 +284,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
    const block_iq2_xxs * x = (const block_iq2_xxs *) vx;

    const int64_t tid = threadIdx.x;
#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;

@ -374,10 +294,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
#else
    NO_DEVICE_CODE;
#endif

}

template<typename dst_t>

@ -387,7 +303,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
    const block_iq2_xs * x = (const block_iq2_xs *) vx;

    const int64_t tid = threadIdx.x;
#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;

@ -396,10 +311,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
#else
    NO_DEVICE_CODE;
#endif

}

template<typename dst_t>
@ -409,7 +320,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
|
|||||||
const block_iq2_s * x = (const block_iq2_s *) vx;
|
const block_iq2_s * x = (const block_iq2_s *) vx;
|
||||||
|
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
#if QK_K == 256
|
|
||||||
const int64_t il = tid/8; // 0...3
|
const int64_t il = tid/8; // 0...3
|
||||||
const int64_t ib = tid%8; // 0...7
|
const int64_t ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
@ -417,10 +327,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
|
|||||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||||
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
||||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
@ -430,7 +336,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
|
|||||||
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
||||||
|
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
#if QK_K == 256
|
|
||||||
const int64_t il = tid/8; // 0...3
|
const int64_t il = tid/8; // 0...3
|
||||||
const int64_t ib = tid%8; // 0...7
|
const int64_t ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
@ -445,10 +350,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
|
|||||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
@ -458,7 +359,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
|
|||||||
const block_iq3_s * x = (const block_iq3_s *) vx;
|
const block_iq3_s * x = (const block_iq3_s *) vx;
|
||||||
|
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
#if QK_K == 256
|
|
||||||
const int64_t il = tid/8; // 0...3
|
const int64_t il = tid/8; // 0...3
|
||||||
const int64_t ib = tid%8; // 0...7
|
const int64_t ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
@ -471,10 +371,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
|
|||||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
@ -484,7 +380,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
|
|||||||
const block_iq1_s * x = (const block_iq1_s *) vx;
|
const block_iq1_s * x = (const block_iq1_s *) vx;
|
||||||
|
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
#if QK_K == 256
|
|
||||||
const int64_t il = tid/8; // 0...3
|
const int64_t il = tid/8; // 0...3
|
||||||
const int64_t ib = tid%8; // 0...7
|
const int64_t ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
@ -497,10 +392,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
|
|||||||
for (int j = 0; j < 8; ++j) {
|
for (int j = 0; j < 8; ++j) {
|
||||||
y[j] = d * (q[j] + delta);
|
y[j] = d * (q[j] + delta);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
@ -510,7 +401,6 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
|
|||||||
const block_iq1_m * x = (const block_iq1_m *) vx;
|
const block_iq1_m * x = (const block_iq1_m *) vx;
|
||||||
|
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
#if QK_K == 256
|
|
||||||
const int64_t il = tid/8; // 0...3
|
const int64_t il = tid/8; // 0...3
|
||||||
const int64_t ib = tid%8; // 0...7
|
const int64_t ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
@ -527,13 +417,8 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
|
|||||||
for (int j = 0; j < 8; ++j) {
|
for (int j = 0; j < 8; ++j) {
|
||||||
y[j] = d * (q[j] + delta);
|
y[j] = d * (q[j] + delta);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||||
|
|
||||||
@ -550,10 +435,8 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
|
|||||||
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
||||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QK_K != 64
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||||
const int64_t i = blockIdx.x;
|
const int64_t i = blockIdx.x;
|
||||||
@ -570,7 +453,6 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
|
|||||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||||
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||||
@ -592,21 +474,13 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
|
|||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
#if QK_K == 256
|
|
||||||
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
|
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||||
#else
|
|
||||||
dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
#if QK_K == 256
|
|
||||||
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||||
#else
|
|
||||||
dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
@ -632,21 +506,13 @@ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k
|
|||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
#if QK_K == 256
|
|
||||||
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
|
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||||
#else
|
|
||||||
dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
#if QK_K == 256
|
|
||||||
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||||
#else
|
|
||||||
dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
@ -700,11 +566,7 @@ static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t
|
|||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||||
const int nb = (k + QK_K - 1) / QK_K;
|
const int nb = (k + QK_K - 1) / QK_K;
|
||||||
#if QK_K == 64
|
|
||||||
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
|
||||||
#else
|
|
||||||
dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
|
dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename src_t, typename dst_t>
|
template <typename src_t, typename dst_t>
|
||||||
|
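The launcher hunks above all follow the same pattern: with only QK_K == 256 left, each row of k quantized values maps to nb = k / QK_K CUDA blocks of 64 threads, and the 32-thread fallback launch disappears. A minimal host-side sketch of that pattern (dequantize_block_qX_K is a placeholder name, not a kernel from this diff):

// Sketch only: shape of the k-quant dequantize launchers after the change, assuming QK_K == 256.
template<typename dst_t>
static void dequantize_row_qX_K_cuda_sketch(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;                              // one CUDA block per 256-element super-block
    dequantize_block_qX_K<<<nb, 64, 0, stream>>>(vx, y);  // 64 threads dequantize one super-block
}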
@@ -22,7 +22,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
    float tmp = 0; // partial sum for thread in warp

-#if QK_K == 256
    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1

@@ -71,37 +70,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
        tmp += dall * sum1 - dmin * sum2;

    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const float2 dall = __half22float2(x[i].dm);
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x * sum1 - dall.y * sum2;
-    }
-#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);
@@ -123,8 +91,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
    float tmp = 0; // partial sum for thread in warp

-#if QK_K == 256
-
    const uint16_t kmask1 = 0x0303;
    const uint16_t kmask2 = 0x0f0f;

@@ -175,34 +141,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
        tmp += d * sum;

    }
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
-    const int in = offset/8;                                  // 0 or 1
-    const int im = offset%8;                                  // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);
@@ -221,7 +159,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
    const block_q4_K * x = (const block_q4_K *)vx + ib0;

-#if QK_K == 256
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;
@@ -306,36 +243,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
#endif

    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const float   * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);
@@ -355,7 +262,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
    float tmp = 0; // partial sum for thread in warp

-#if QK_K == 256
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;
@@ -426,30 +332,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
    }

-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t  * s = x[i].scales;
-        const float   * y = yy + i*QK_K + step;
-        const float     d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

@@ -470,8 +352,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
    const block_q6_K * x = (const block_q6_K *)vx + ib0;

-#if QK_K == 256
-
    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1

@@ -526,37 +406,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,

    }

-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t  * s  = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);
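Each of the mat-vec kernels above ends by folding the per-thread partial sums with warp_reduce_sum. That helper is defined in common.cuh rather than in this diff; a typical shuffle-based warp reduction, shown here only as an assumed sketch of what it does, is:

// Assumed sketch (not from this diff): butterfly warp reduction over 32 lanes.
static __device__ __forceinline__ float warp_reduce_sum_sketch(float x) {
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, 32); // add the value held by lane (laneid ^ offset)
    }
    return x; // every lane ends up with the full warp sum
}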
ggml-cuda/fattn-common.cuh (new file, 162 lines)
@@ -0,0 +1,162 @@
#include "common.cuh"

#include <cstdint>

#define FATTN_KQ_STRIDE 256
#define HALF_MAX_HALF         __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
#define SOFTMAX_FTZ_THRESHOLD -20.0f                    // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.

typedef void (* fattn_kernel_t)(
    const char * __restrict__ Q,
    const char * __restrict__ K,
    const char * __restrict__ V,
    const char * __restrict__ mask,
    float      * __restrict__ dst,
    float2     * __restrict__ dst_meta,
    const float scale,
    const float max_bias,
    const float m0,
    const float m1,
    const uint32_t n_head_log2,
    const int ne00,
    const int ne01,
    const int ne02,
    const int ne03,
    const int ne10,
    const int ne11,
    const int ne12,
    const int ne13,
    const int ne31,
    const int nb31,
    const int nb01,
    const int nb02,
    const int nb03,
    const int nb11,
    const int nb12,
    const int nb13,
    const int ne0,
    const int ne1,
    const int ne2,
    const int ne3);

template<int D, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_combine_results(
    const float  * __restrict__ VKQ_parts,
    const float2 * __restrict__ VKQ_meta,
    float * __restrict__ dst) {
    VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x;
    VKQ_meta  += parallel_blocks   * gridDim.y*blockIdx.x;
    dst       += D                 * gridDim.y*blockIdx.x;

    const int tid = threadIdx.x;
    __builtin_assume(tid < D);

    __shared__ float2 meta[parallel_blocks];
    if (tid < 2*parallel_blocks) {
        ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid];
    }

    __syncthreads();

    float kqmax = meta[0].x;
#pragma unroll
    for (int l = 1; l < parallel_blocks; ++l) {
        kqmax = max(kqmax, meta[l].x);
    }

    float VKQ_numerator   = 0.0f;
    float VKQ_denominator = 0.0f;
#pragma unroll
    for (int l = 0; l < parallel_blocks; ++l) {
        const float diff = meta[l].x - kqmax;
        const float KQ_max_scale = expf(diff);
        const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
        *((uint32_t *) &KQ_max_scale) &= ftz_mask;

        VKQ_numerator   += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid];
        VKQ_denominator += KQ_max_scale * meta[l].y;
    }

    dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
}

template <int D, int parallel_blocks>
void launch_fattn(ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, int nwarps, int cols_per_block) {
    const ggml_tensor * Q = dst->src[0];
    const ggml_tensor * K = dst->src[1];
    const ggml_tensor * V = dst->src[2];

    const ggml_tensor * mask = dst->src[3];

    ggml_tensor * KQV = dst;

    GGML_ASSERT(Q->type == GGML_TYPE_F32);
    GGML_ASSERT(K->type == GGML_TYPE_F16);
    GGML_ASSERT(V->type == GGML_TYPE_F16);
    GGML_ASSERT(KQV->type == GGML_TYPE_F32);

    GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
    GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
        "the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");

    GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");

    ggml_cuda_pool & pool = ctx.pool();
    cudaStream_t main_stream = ctx.stream();

    ggml_cuda_pool_alloc<float>  dst_tmp(pool);
    ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);

    if (parallel_blocks > 1) {
        dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
        dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
    }

    const dim3 block_dim(WARP_SIZE, nwarps, 1);
    const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
    const int  shmem = 0;

    float scale    = 1.0f;
    float max_bias = 0.0f;

    memcpy(&scale,    (float *) KQV->op_params + 0, sizeof(float));
    memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));

    const uint32_t n_head      = Q->ne[2];
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));

    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    fattn_kernel<<<blocks_num, block_dim, shmem, main_stream>>>(
        (const char *) Q->data,
        (const char *) K->data,
        (const char *) V->data,
        mask ? ((const char *) mask->data) : nullptr,
        (parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
        scale, max_bias, m0, m1, n_head_log2,
        Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
        K->ne[0], K->ne[1], K->ne[2], K->ne[3],
        mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
        Q->nb[1], Q->nb[2], Q->nb[3],
        K->nb[1], K->nb[2], K->nb[3],
        KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
    );
    CUDA_CHECK(cudaGetLastError());

    if ((parallel_blocks) == 1) {
        return;
    }

    const dim3 block_dim_combine(D, 1, 1);
    const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
    const int  shmem_combine = 0;

    flash_attn_combine_results<D, parallel_blocks>
        <<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
        (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
    CUDA_CHECK(cudaGetLastError());
}
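flash_attn_combine_results merges the partial attention outputs produced by parallel_blocks independent launches. Following the code above, with per-block softmax maxima m_l and denominators s_l stored in VKQ_meta and partial numerators o_l in VKQ_parts, each output element is

\[
m = \max_l m_l, \qquad \mathrm{dst} = \frac{\sum_l e^{\,m_l - m}\, o_l}{\sum_l e^{\,m_l - m}\, s_l},
\]

where any factor with m_l - m below SOFTMAX_FTZ_THRESHOLD is flushed to zero via the ftz_mask bit trick.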
ggml-cuda/fattn-tile-f16.cu (new file, 316 lines)
@@ -0,0 +1,316 @@
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-tile-f16.cuh"

#define FATTN_KQ_STRIDE_TILE_F16 64

template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_tile_ext_f16(
    const char * __restrict__ Q,
    const char * __restrict__ K,
    const char * __restrict__ V,
    const char * __restrict__ mask,
    float      * __restrict__ dst,
    float2     * __restrict__ dst_meta,
    const float scale,
    const float max_bias,
    const float m0,
    const float m1,
    const uint32_t n_head_log2,
    const int ne00,
    const int ne01,
    const int ne02,
    const int ne03,
    const int ne10,
    const int ne11,
    const int ne12,
    const int ne13,
    const int ne31,
    const int nb31,
    const int nb01,
    const int nb02,
    const int nb03,
    const int nb11,
    const int nb12,
    const int nb13,
    const int ne0,
    const int ne1,
    const int ne2,
    const int ne3) {
#if FP16_AVAILABLE
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
    const int ip  =  blockIdx.x % parallel_blocks;          // Index in group of blocks running for the same column in parallel.

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
    const float2 * Q_f2  = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
    const half2  * K_h2  = (const half2  *) (K + nb12*(blockIdx.y / gqa_ratio));
    const half2  * V_h2  = (const half2  *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
    const half   * maskh = (const half   *) mask + ne11*ic0;

    const int stride_KV2 = nb11 / sizeof(half2);

    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");

    __shared__ half KQ[ncols*FATTN_KQ_STRIDE_TILE_F16];
    half2 * KQ2 = (half2 *) KQ;

    __shared__ half2 KV_tmp[FATTN_KQ_STRIDE_TILE_F16][D/2 + 1]; // Pad D to avoid memory bank conflicts.

    half kqmax[ncols/nwarps];
#pragma unroll
    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
        kqmax[j0/nwarps] = -HALF_MAX_HALF;
    }
    half2 kqsum[ncols/nwarps] = {{0.0f, 0.0f}};

    half2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}};

    // Convert Q to half2 and store in registers:
    __shared__ half2 Q_h2[ncols][D/2];
#pragma unroll
    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
        const int j = j0 + threadIdx.y;

#pragma unroll
        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
            const int i = i0 + threadIdx.x;

            const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
            Q_h2[j][i] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
        }
    }

    __syncthreads();

    const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F16;
    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F16) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        half kqmax_new[ncols/nwarps];
#pragma unroll
        for (int j = 0; j < ncols/nwarps; ++j) {
            kqmax_new[j] = kqmax[j];
        }

#pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += nwarps) {
            const int i_KQ = i_KQ_0 + threadIdx.y;

#pragma unroll
            for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
                const int k_KQ = k_KQ_0 + threadIdx.x;

                KV_tmp[i_KQ][k_KQ] = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
            }
        }

        __syncthreads();

        half2 sum2[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE][ncols/nwarps] = {{{0.0f, 0.0f}}};

#pragma unroll
        for (int k_KQ = 0; k_KQ < D/2; ++k_KQ) {
            half2 K_k[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE];
            half2 Q_k[ncols/nwarps];

#pragma unroll
            for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
                const int i_KQ = i_KQ_0 + threadIdx.x;

                K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ];
            }
#pragma unroll
            for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
                const int j_KQ = j_KQ_0 + threadIdx.y;

                Q_k[j_KQ_0/nwarps] = Q_h2[j_KQ][k_KQ];
            }

#pragma unroll
            for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
#pragma unroll
                for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
                    sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE]*Q_k[j_KQ_0/nwarps];
                }
            }
        }

#pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
            const int i_KQ = i_KQ_0 + threadIdx.x;

#pragma unroll
            for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
                const int j_KQ = j_KQ_0 + threadIdx.y;

                half sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
                sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);

                kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum);

                KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F16 + i_KQ] = sum;
            }
        }

        __syncthreads();

#pragma unroll
        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
            const int j = j0 + threadIdx.y;

            kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]);
            const half2 KQ_max_scale = __half2half2(hexp(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]));
            kqmax[j0/nwarps] = kqmax_new[j0/nwarps];

#pragma unroll
            for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F16/2; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

                const half2 diff = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] - __half2half2(kqmax[j0/nwarps]);
                const half2 val = h2exp(diff);
                kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + val;
                KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] = val;
            }

#pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
                VKQ[j0/nwarps][i0/WARP_SIZE] *= KQ_max_scale;
            }
        }

        __syncthreads();

#pragma unroll
        for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += nwarps) {
            const int k = k0 + threadIdx.y;

#pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

                KV_tmp[k][i] = V_h2[(k_VKQ_0 + k)*stride_KV2 + i];
            }
        }

        __syncthreads();

#pragma unroll
        for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += 2) {
            half2 V_k[(D/2)/WARP_SIZE][2];
            half2 KQ_k[ncols/nwarps];

#pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

                V_k[i0/WARP_SIZE][0] = KV_tmp[k0 + 0][i];
                V_k[i0/WARP_SIZE][1] = KV_tmp[k0 + 1][i];
            }
#pragma unroll
            for (int j0 = 0; j0 < ncols; j0 += nwarps) {
                const int j = j0 + threadIdx.y;

                KQ_k[j0/nwarps] = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + k0/2];
            }

#pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
#pragma unroll
                for (int j0 = 0; j0 < ncols; j0 += nwarps) {
                    VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][0]* __low2half2(KQ_k[j0/nwarps]);
                    VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][1]*__high2half2(KQ_k[j0/nwarps]);
                }
            }
        }

        __syncthreads();
    }

#pragma unroll
    for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
        const int j_VKQ = j_VKQ_0 + threadIdx.y;

        if (ic0 + j_VKQ >= ne01) {
            return;
        }

        half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]);
        kqsum_j = warp_reduce_sum(kqsum_j);

#pragma unroll
        for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) {
            const int i0 = i00 + 2*threadIdx.x;

            half2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
            if (parallel_blocks == 1) {
                dst_val /= __half2half2(kqsum_j);
            }
            const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] =  __low2float(dst_val);
            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = __high2float(dst_val);
        }

        if (parallel_blocks != 1 && threadIdx.x == 0) {
            dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
        }
    }
#else
    NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
}

template <int cols_per_block, int parallel_blocks>
void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * Q = dst->src[0];
    switch (Q->ne[0]) {
        case  64: {
            constexpr int      D = 64;
            constexpr int nwarps = 8;
            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
        } break;
        case 128: {
            constexpr int      D = 128;
            constexpr int nwarps = 8;
            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
        } break;
        default: {
            GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
        } break;
    }
}

void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * KQV = dst;
    const ggml_tensor * Q   = dst->src[0];

    const int32_t precision = KQV->op_params[2];
    GGML_ASSERT(precision == GGML_PREC_DEFAULT);

    if (Q->ne[1] <= 16) {
        constexpr int cols_per_block  = 16;
        constexpr int parallel_blocks = 4;
        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
        return;
    }

    if (Q->ne[1] <= 32) {
        constexpr int cols_per_block  = 32;
        constexpr int parallel_blocks = 4;
        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
        return;
    }

    constexpr int cols_per_block  = 32;
    constexpr int parallel_blocks = 1;
    launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
}
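The tile kernel above processes the KV sequence in chunks of FATTN_KQ_STRIDE_TILE_F16 and keeps the softmax numerically stable with the usual streaming update. Writing m for the running maximum, s for the running denominator (kqsum), v for the accumulator (VKQ), and x_i for the scores of the new tile, each iteration computes

\[
m' = \max\bigl(m, \max_i x_i\bigr), \qquad
s' = s\, e^{\,m - m'} + \sum_i e^{\,x_i - m'}, \qquad
v' = v\, e^{\,m - m'} + \sum_i e^{\,x_i - m'}\, V_i,
\]

which is exactly the KQ_max_scale rescaling applied to kqsum and VKQ before the V tile is multiplied in.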
ggml-cuda/fattn-tile-f16.cuh (new file, 3 lines)
@@ -0,0 +1,3 @@
#include "common.cuh"

void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml-cuda/fattn-tile-f32.cu (new file, 309 lines)
@@ -0,0 +1,309 @@
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-tile-f32.cuh"

#define FATTN_KQ_STRIDE_TILE_F32 32

template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_tile_ext_f32(
    const char * __restrict__ Q,
    const char * __restrict__ K,
    const char * __restrict__ V,
    const char * __restrict__ mask,
    float      * __restrict__ dst,
    float2     * __restrict__ dst_meta,
    const float scale,
    const float max_bias,
    const float m0,
    const float m1,
    const uint32_t n_head_log2,
    const int ne00,
    const int ne01,
    const int ne02,
    const int ne03,
    const int ne10,
    const int ne11,
    const int ne12,
    const int ne13,
    const int ne31,
    const int nb31,
    const int nb01,
    const int nb02,
    const int nb03,
    const int nb11,
    const int nb12,
    const int nb13,
    const int ne0,
    const int ne1,
    const int ne2,
    const int ne3) {
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
    const int ip  =  blockIdx.x % parallel_blocks;          // Index in group of blocks running for the same column in parallel.

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
    const float2 * Q_f2  = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
    const half2  * K_h2  = (const half2  *) (K + nb12*(blockIdx.y / gqa_ratio));
    const half2  * V_h2  = (const half2  *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
    const half   * maskh = (const half   *) mask + ne11*ic0;

    const int stride_KV2 = nb11 / sizeof(half2);

    const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");

    __shared__ float KQ[ncols*FATTN_KQ_STRIDE_TILE_F32];

    __shared__ float KV_tmp[FATTN_KQ_STRIDE_TILE_F32][D + 1]; // Pad D to avoid memory bank conflicts.
    float2 * KV_tmp2 = (float2 *) KV_tmp;

    float kqmax[ncols/nwarps];
#pragma unroll
    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
        kqmax[j0/nwarps] = -FLT_MAX/2.0f;
    }
    float kqsum[ncols/nwarps] = {0.0f};

    float2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}};

    // Convert Q to half2 and store in registers:
    __shared__ float Q_f[ncols][D];
#pragma unroll
    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
        const int j = j0 + threadIdx.y;

#pragma unroll
        for (int i0 = 0; i0 < D; i0 += 2*WARP_SIZE) {
            float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0/2 + threadIdx.x] : make_float2(0.0f, 0.0f);
            Q_f[j][i0 + 0*WARP_SIZE + threadIdx.x] = tmp.x * scale;
            Q_f[j][i0 + 1*WARP_SIZE + threadIdx.x] = tmp.y * scale;
        }
    }

    __syncthreads();

    const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F32;
    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F32) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        float kqmax_new[ncols/nwarps];
#pragma unroll
        for (int j = 0; j < ncols/nwarps; ++j) {
            kqmax_new[j] = kqmax[j];
        }

#pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += nwarps) {
            const int i_KQ = i_KQ_0 + threadIdx.y;

#pragma unroll
            for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 2*WARP_SIZE) {
                const half2 tmp = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + threadIdx.x];
                KV_tmp[i_KQ][k_KQ_0 + 0*WARP_SIZE + threadIdx.x] =  __low2float(tmp);
                KV_tmp[i_KQ][k_KQ_0 + 1*WARP_SIZE + threadIdx.x] = __high2float(tmp);
            }
        }

        __syncthreads();

        float sum[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE][ncols/nwarps] = {{0.0f}};

#pragma unroll
        for (int k_KQ = 0; k_KQ < D; ++k_KQ) {
            float K_k[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE];
            float Q_k[ncols/nwarps];

#pragma unroll
            for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
                const int i_KQ = i_KQ_0 + threadIdx.x;

                K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ];
            }
#pragma unroll
            for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
                const int j_KQ = j_KQ_0 + threadIdx.y;

                Q_k[j_KQ_0/nwarps] = Q_f[j_KQ][k_KQ];
            }

#pragma unroll
            for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
#pragma unroll
                for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
                    sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE] * Q_k[j_KQ_0/nwarps];
                }
            }
        }

#pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
            const int i_KQ = i_KQ_0 + threadIdx.x;

#pragma unroll
            for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
                const int j_KQ = j_KQ_0 + threadIdx.y;

                sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;

                kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);

                KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F32 + i_KQ] = sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps];
            }
        }

        __syncthreads();

#pragma unroll
        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
            const int j = j0 + threadIdx.y;

            kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]);
            const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]);
            kqmax[j0/nwarps] = kqmax_new[j0/nwarps];

            float kqsum_add = 0.0f;
#pragma unroll
            for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F32; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

                const float diff = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] - kqmax[j0/nwarps];
                const float val = expf(diff);
                kqsum_add += val;
                KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] = val;
            }
            kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add;

#pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
                VKQ[j0/nwarps][i0/WARP_SIZE].x *= KQ_max_scale;
                VKQ[j0/nwarps][i0/WARP_SIZE].y *= KQ_max_scale;
            }
        }

        __syncthreads();

#pragma unroll
        for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F32; k0 += nwarps) {
            const int k = k0 + threadIdx.y;

#pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

                KV_tmp2[k*(D/2) + i].x =  __low2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]);
                KV_tmp2[k*(D/2) + i].y = __high2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]);
            }
        }

        __syncthreads();

#pragma unroll
        for (int k = 0; k < FATTN_KQ_STRIDE_TILE_F32; ++k) {
            float2 V_k[(D/2)/WARP_SIZE];
            float  KQ_k[ncols/nwarps];

#pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

                V_k[i0/WARP_SIZE] = KV_tmp2[k*(D/2) + i];
            }
#pragma unroll
            for (int j0 = 0; j0 < ncols; j0 += nwarps) {
                const int j = j0 + threadIdx.y;

                KQ_k[j0/nwarps] = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + k];
            }

#pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
#pragma unroll
                for (int j0 = 0; j0 < ncols; j0 += nwarps) {
                    VKQ[j0/nwarps][i0/WARP_SIZE].x += V_k[i0/WARP_SIZE].x*KQ_k[j0/nwarps];
                    VKQ[j0/nwarps][i0/WARP_SIZE].y += V_k[i0/WARP_SIZE].y*KQ_k[j0/nwarps];
                }
            }
        }

        __syncthreads();
    }

#pragma unroll
    for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
        const int j_VKQ = j_VKQ_0 + threadIdx.y;

        if (ic0 + j_VKQ >= ne01) {
            return;
        }

        float kqsum_j = kqsum[j_VKQ_0/nwarps];
        kqsum_j = warp_reduce_sum(kqsum_j);

#pragma unroll
        for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) {
            const int i0 = i00 + 2*threadIdx.x;

            float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
            if (parallel_blocks == 1) {
                dst_val.x /= kqsum_j;
                dst_val.y /= kqsum_j;
            }
            const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] = dst_val.x;
            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = dst_val.y;
        }

        if (parallel_blocks != 1 && threadIdx.x == 0) {
            dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
        }
    }
}

template <int cols_per_block, int parallel_blocks>
void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * Q = dst->src[0];
    switch (Q->ne[0]) {
        case  64: {
            constexpr int      D = 64;
            constexpr int nwarps = 8;
            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
        } break;
        case 128: {
            constexpr int      D = 128;
            constexpr int nwarps = 8;
            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
        } break;
        default: {
            GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
        } break;
    }
}

void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * Q = dst->src[0];

    if (Q->ne[1] <= 16) {
        constexpr int cols_per_block  = 16;
        constexpr int parallel_blocks = 4;
        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
        return;
    }

    if (Q->ne[1] <= 32) {
        constexpr int cols_per_block  = 32;
        constexpr int parallel_blocks = 4;
        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
        return;
    }

    constexpr int cols_per_block  = 32;
    constexpr int parallel_blocks = 1;
    launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
}
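Both tile kernels bias the mask values by a per-head ALiBi slope obtained from get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1). The slope function itself lives in common.cuh and is not part of this diff; under the standard ALiBi parameterization (an assumption here), with n = n_head_log2 and head index h, the m0/m1 factors computed in launch_fattn give

\[
m_0 = 2^{-\mathrm{max\_bias}/n}, \qquad
m_1 = 2^{-\mathrm{max\_bias}/(2n)}, \qquad
\mathrm{slope}(h) = \begin{cases} m_0^{\,h+1} & h < n \\ m_1^{\,2(h-n)+1} & h \ge n. \end{cases}
\]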
ggml-cuda/fattn-tile-f32.cuh (new file, 3 lines)
@@ -0,0 +1,3 @@
#include "common.cuh"

void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
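The two small headers only export ggml_cuda_flash_attn_ext_tile_f16 and ggml_cuda_flash_attn_ext_tile_f32. A hypothetical caller-side sketch (not part of this diff) that selects between them based on the precision stored in the op params, mirroring the assertion inside the f16 entry point:

// Hypothetical dispatch sketch, not taken from this diff.
static void flash_attn_tile_dispatch_sketch(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const int32_t precision = dst->op_params[2];     // same field the f16 entry point checks
    if (precision == GGML_PREC_DEFAULT) {
        ggml_cuda_flash_attn_ext_tile_f16(ctx, dst); // fp16 accumulators, faster
    } else {
        ggml_cuda_flash_attn_ext_tile_f32(ctx, dst); // fp32 accumulators, more accurate
    }
}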
ggml-cuda/fattn-vec-f16.cu (new file, 330 lines)
@@ -0,0 +1,330 @@
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-vec-f16.cuh"

template<int D, int ncols, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f16(
    const char * __restrict__ Q,
    const char * __restrict__ K,
    const char * __restrict__ V,
    const char * __restrict__ mask,
    float      * __restrict__ dst,
    float2     * __restrict__ dst_meta,
    const float scale,
    const float max_bias,
    const float m0,
    const float m1,
    const uint32_t n_head_log2,
    const int ne00,
    const int ne01,
    const int ne02,
    const int ne03,
    const int ne10,
    const int ne11,
    const int ne12,
    const int ne13,
    const int ne31,
    const int nb31,
    const int nb01,
    const int nb02,
    const int nb03,
    const int nb11,
    const int nb12,
    const int nb13,
    const int ne0,
    const int ne1,
    const int ne2,
    const int ne3) {
#if FP16_AVAILABLE
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
    const int ip  =  blockIdx.x % parallel_blocks;          // Index in group of blocks running for the same column in parallel.

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
    const float2 * Q_f2  = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
    const half2  * K_h2  = (const half2  *) (K + nb12*(blockIdx.y / gqa_ratio));
    const half   * V_h   = (const half   *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
    const half   * maskh = (const half   *) mask + ne11*ic0;

    const int stride_KV  = nb11 / sizeof(half);
    const int stride_KV2 = nb11 / sizeof(half2);

    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
    constexpr int nwarps = D / WARP_SIZE;
    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
    __builtin_assume(tid < D);

    __shared__ half KQ[ncols*D];
#pragma unroll
    for (int j = 0; j < ncols; ++j) {
        KQ[j*D + tid] = -HALF_MAX_HALF;
    }
    half2 * KQ2 = (half2 *) KQ;

    half kqmax[ncols];
#pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqmax[j] = -HALF_MAX_HALF;
    }
    half kqsum[ncols] = {0.0f};

    __shared__ half kqmax_shared[ncols][WARP_SIZE];
    __shared__ half kqsum_shared[ncols][WARP_SIZE];
#pragma unroll
    for (int j = 0; j < ncols; ++j) {
        if (threadIdx.y == 0) {
            kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
            kqsum_shared[j][threadIdx.x] = 0.0f;
        }
    }
    __syncthreads();

    // Convert Q to half2 and store in registers:
    half2 Q_h2[ncols][D/(2*WARP_SIZE)];
#pragma unroll
    for (int j = 0; j < ncols; ++j) {
#pragma unroll
        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
            const int i = i0 + threadIdx.x;

            const float2 tmp = ncols <= 2 || ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
            Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
        }
    }

    half2 VKQ[ncols] = {{0.0f, 0.0f}};

    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
        // see https://github.com/ggerganov/llama.cpp/pull/7061 .
        // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
        half kqmax_new = kqmax[0];
        half kqmax_new_arr[ncols];
#pragma unroll
        for (int j = 0; j < ncols; ++j) {
            kqmax_new_arr[j] = kqmax[j];
        }

#pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
            const int i_KQ = i_KQ_0 + threadIdx.y;

            if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
half2 sum2[ncols] = {{0.0f, 0.0f}};
|
||||||
|
#pragma unroll
|
||||||
|
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
||||||
|
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||||
|
|
||||||
|
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
sum2[j] = warp_reduce_sum(sum2[j]);
|
||||||
|
half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
|
||||||
|
sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
|
||||||
|
|
||||||
|
if (ncols == 1) {
|
||||||
|
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
|
||||||
|
} else {
|
||||||
|
kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
KQ[j*D + i_KQ] = sum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
|
||||||
|
|
||||||
|
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
|
||||||
|
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||||
|
|
||||||
|
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
|
||||||
|
kqmax[j] = kqmax_new_j;
|
||||||
|
|
||||||
|
const half val = hexp(KQ[j*D + tid] - kqmax[j]);
|
||||||
|
kqsum[j] = kqsum[j]*KQ_max_scale + val;
|
||||||
|
KQ[j*D + tid] = val;
|
||||||
|
|
||||||
|
VKQ[j] *= __half2half2(KQ_max_scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int k0 = 0; k0 < D; k0 += 2) {
|
||||||
|
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
half2 V_k;
|
||||||
|
reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
|
||||||
|
reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
kqsum[j] = warp_reduce_sum(kqsum[j]);
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
kqsum_shared[j][threadIdx.y] = kqsum[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
||||||
|
if (ncols > 2 && ic0 + j_VKQ >= ne01) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
||||||
|
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
|
||||||
|
|
||||||
|
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
|
||||||
|
if (parallel_blocks == 1) {
|
||||||
|
dst_val /= kqsum[j_VKQ];
|
||||||
|
}
|
||||||
|
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
|
||||||
|
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
|
||||||
|
dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
NO_DEVICE_CODE;
|
||||||
|
#endif // FP16_AVAILABLE
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
ggml_tensor * KQV = dst;
|
||||||
|
ggml_tensor * Q = dst->src[0];
|
||||||
|
|
||||||
|
const int32_t precision = KQV->op_params[2];
|
||||||
|
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||||
|
|
||||||
|
constexpr int cols_per_block = 1;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64: {
|
||||||
|
constexpr int D = 64;
|
||||||
|
constexpr int nwarps = D/WARP_SIZE;
|
||||||
|
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
|
||||||
|
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||||
|
} break;
|
||||||
|
case 128: {
|
||||||
|
constexpr int D = 128;
|
||||||
|
constexpr int nwarps = D/WARP_SIZE;
|
||||||
|
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
|
||||||
|
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||||
|
} break;
|
||||||
|
case 256: {
|
||||||
|
constexpr int D = 256;
|
||||||
|
constexpr int nwarps = D/WARP_SIZE;
|
||||||
|
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
|
||||||
|
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||||
|
} break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int cols_per_block, int parallel_blocks>
|
||||||
|
void launch_fattn_vec_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
const ggml_tensor * Q = dst->src[0];
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64: {
|
||||||
|
constexpr int D = 64;
|
||||||
|
constexpr int nwarps = D/WARP_SIZE;
|
||||||
|
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
|
||||||
|
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||||
|
} break;
|
||||||
|
case 128: {
|
||||||
|
constexpr int D = 128;
|
||||||
|
constexpr int nwarps = D/WARP_SIZE;
|
||||||
|
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
|
||||||
|
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||||
|
} break;
|
||||||
|
default: {
|
||||||
|
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||||
|
} break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
const ggml_tensor * KQV = dst;
|
||||||
|
const ggml_tensor * Q = dst->src[0];
|
||||||
|
|
||||||
|
const int32_t precision = KQV->op_params[2];
|
||||||
|
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||||
|
|
||||||
|
if (Q->ne[1] == 1) {
|
||||||
|
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Q->ne[1] == 2) {
|
||||||
|
constexpr int cols_per_block = 2;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Q->ne[1] <= 4) {
|
||||||
|
constexpr int cols_per_block = 4;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Q->ne[1] <= 8) {
|
||||||
|
constexpr int cols_per_block = 8;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr int cols_per_block = 8;
|
||||||
|
constexpr int parallel_blocks = 1;
|
||||||
|
launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||||
|
}
|
5
ggml-cuda/fattn-vec-f16.cuh
Normal file
5
ggml-cuda/fattn-vec-f16.cuh
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
#include "common.cuh"
|
||||||
|
|
||||||
|
void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
|
|
||||||
|
void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
279
ggml-cuda/fattn-vec-f32.cu
Normal file
279
ggml-cuda/fattn-vec-f32.cu
Normal file
@ -0,0 +1,279 @@
|
|||||||
|
#include "common.cuh"
|
||||||
|
#include "fattn-common.cuh"
|
||||||
|
#include "fattn-vec-f32.cuh"
|
||||||
|
|
||||||
|
template<int D, int ncols, int parallel_blocks> // D == head size
|
||||||
|
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
|
__launch_bounds__(D, 1)
|
||||||
|
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
|
static __global__ void flash_attn_vec_ext_f32(
|
||||||
|
const char * __restrict__ Q,
|
||||||
|
const char * __restrict__ K,
|
||||||
|
const char * __restrict__ V,
|
||||||
|
const char * __restrict__ mask,
|
||||||
|
float * __restrict__ dst,
|
||||||
|
float2 * __restrict__ dst_meta,
|
||||||
|
const float scale,
|
||||||
|
const float max_bias,
|
||||||
|
const float m0,
|
||||||
|
const float m1,
|
||||||
|
const uint32_t n_head_log2,
|
||||||
|
const int ne00,
|
||||||
|
const int ne01,
|
||||||
|
const int ne02,
|
||||||
|
const int ne03,
|
||||||
|
const int ne10,
|
||||||
|
const int ne11,
|
||||||
|
const int ne12,
|
||||||
|
const int ne13,
|
||||||
|
const int ne31,
|
||||||
|
const int nb31,
|
||||||
|
const int nb01,
|
||||||
|
const int nb02,
|
||||||
|
const int nb03,
|
||||||
|
const int nb11,
|
||||||
|
const int nb12,
|
||||||
|
const int nb13,
|
||||||
|
const int ne0,
|
||||||
|
const int ne1,
|
||||||
|
const int ne2,
|
||||||
|
const int ne3) {
|
||||||
|
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||||
|
|
||||||
|
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
||||||
|
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
||||||
|
|
||||||
|
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||||
|
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
|
||||||
|
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
|
||||||
|
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
|
||||||
|
const half * maskh = (const half *) mask + ne11*ic0;
|
||||||
|
|
||||||
|
const int stride_KV = nb11 / sizeof(half);
|
||||||
|
const int stride_KV2 = nb11 / sizeof(half2);
|
||||||
|
|
||||||
|
const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
|
||||||
|
|
||||||
|
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||||
|
constexpr int nwarps = D / WARP_SIZE;
|
||||||
|
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||||
|
__builtin_assume(tid < D);
|
||||||
|
|
||||||
|
__shared__ float KQ[ncols*D];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
KQ[j*D + tid] = -FLT_MAX/2.0f;
|
||||||
|
}
|
||||||
|
|
||||||
|
float kqmax[ncols];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
kqmax[j] = -FLT_MAX/2.0f;
|
||||||
|
}
|
||||||
|
float kqsum[ncols] = {0.0f};
|
||||||
|
|
||||||
|
__shared__ float kqmax_shared[ncols][WARP_SIZE];
|
||||||
|
__shared__ float kqsum_shared[ncols][WARP_SIZE];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
if (threadIdx.y == 0) {
|
||||||
|
kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
|
||||||
|
kqsum_shared[j][threadIdx.x] = 0.0f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
// Convert Q to half2 and store in registers:
|
||||||
|
float2 Q_h2[ncols][D/(2*WARP_SIZE)];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||||
|
const int i = i0 + threadIdx.x;
|
||||||
|
|
||||||
|
Q_h2[j][i0/WARP_SIZE] = ncols <= 2 || ic0 + j ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
|
||||||
|
Q_h2[j][i0/WARP_SIZE].x *= scale;
|
||||||
|
Q_h2[j][i0/WARP_SIZE].y *= scale;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
float VKQ[ncols] = {0.0f};
|
||||||
|
|
||||||
|
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
|
||||||
|
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
|
||||||
|
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||||
|
|
||||||
|
float kqmax_new_arr[ncols];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
kqmax_new_arr[j] = kqmax[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
|
||||||
|
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||||
|
|
||||||
|
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
float sum[ncols] = {0.0f};
|
||||||
|
#pragma unroll
|
||||||
|
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
||||||
|
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||||
|
|
||||||
|
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
sum[j] += __low2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].x;
|
||||||
|
sum[j] += __high2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].y;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
sum[j] = warp_reduce_sum(sum[j]);
|
||||||
|
sum[j] += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
|
||||||
|
|
||||||
|
kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum[j]);
|
||||||
|
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
KQ[j*D + i_KQ] = sum[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
float kqmax_new_j = kqmax_new_arr[j];
|
||||||
|
|
||||||
|
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
float kqmax_new_j = kqmax_shared[j][threadIdx.x];
|
||||||
|
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||||
|
|
||||||
|
const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
|
||||||
|
kqmax[j] = kqmax_new_j;
|
||||||
|
|
||||||
|
const float val = expf(KQ[j*D + tid] - kqmax[j]);
|
||||||
|
kqsum[j] = kqsum[j]*KQ_max_scale + val;
|
||||||
|
KQ[j*D + tid] = val;
|
||||||
|
|
||||||
|
VKQ[j] *= KQ_max_scale;
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int k = 0; k < D; ++k) {
|
||||||
|
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
const float V_ki = __half2float(V_h[(k_VKQ_0 + k)*stride_KV + tid]);
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
VKQ[j] += V_ki*KQ[j*D + k];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
kqsum[j] = warp_reduce_sum(kqsum[j]);
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
kqsum_shared[j][threadIdx.y] = kqsum[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
||||||
|
if (ncols > 2 && ic0 + j_VKQ >= ne01) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
||||||
|
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
|
||||||
|
|
||||||
|
float dst_val = VKQ[j_VKQ];
|
||||||
|
if (parallel_blocks == 1) {
|
||||||
|
dst_val /= kqsum[j_VKQ];
|
||||||
|
}
|
||||||
|
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
|
||||||
|
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
|
||||||
|
dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int cols_per_block, int parallel_blocks>
|
||||||
|
void launch_fattn_vec_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
const ggml_tensor * Q = dst->src[0];
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64: {
|
||||||
|
constexpr int D = 64;
|
||||||
|
constexpr int nwarps = D/WARP_SIZE;
|
||||||
|
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>;
|
||||||
|
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||||
|
} break;
|
||||||
|
case 128: {
|
||||||
|
constexpr int D = 128;
|
||||||
|
constexpr int nwarps = D/WARP_SIZE;
|
||||||
|
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>;
|
||||||
|
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||||
|
} break;
|
||||||
|
default: {
|
||||||
|
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||||
|
} break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
const ggml_tensor * Q = dst->src[0];
|
||||||
|
|
||||||
|
if (Q->ne[1] == 1) {
|
||||||
|
constexpr int cols_per_block = 1;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Q->ne[1] == 2) {
|
||||||
|
constexpr int cols_per_block = 2;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Q->ne[1] <= 4) {
|
||||||
|
constexpr int cols_per_block = 4;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Q->ne[1] <= 8) {
|
||||||
|
constexpr int cols_per_block = 8;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr int cols_per_block = 8;
|
||||||
|
constexpr int parallel_blocks = 1;
|
||||||
|
launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||||
|
}
|
3
ggml-cuda/fattn-vec-f32.cuh
Normal file
3
ggml-cuda/fattn-vec-f32.cuh
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
#include "common.cuh"
|
||||||
|
|
||||||
|
void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
@ -1,4 +1,9 @@
|
|||||||
#include "common.cuh"
|
#include "common.cuh"
|
||||||
|
#include "fattn-common.cuh"
|
||||||
|
#include "fattn-tile-f16.cuh"
|
||||||
|
#include "fattn-tile-f32.cuh"
|
||||||
|
#include "fattn-vec-f16.cuh"
|
||||||
|
#include "fattn-vec-f32.cuh"
|
||||||
#include "fattn.cuh"
|
#include "fattn.cuh"
|
||||||
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
@ -7,251 +12,6 @@
|
|||||||
#include <mma.h>
|
#include <mma.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define FATTN_KQ_STRIDE 256
|
|
||||||
#define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
|
|
||||||
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
|
|
||||||
|
|
||||||
template<int D, int ncols, int parallel_blocks> // D == head size
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
|
||||||
__launch_bounds__(D, 1)
|
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
|
||||||
static __global__ void flash_attn_vec_ext_f16(
|
|
||||||
const char * __restrict__ Q,
|
|
||||||
const char * __restrict__ K,
|
|
||||||
const char * __restrict__ V,
|
|
||||||
const char * __restrict__ mask,
|
|
||||||
float * __restrict__ dst,
|
|
||||||
float2 * __restrict__ dst_meta,
|
|
||||||
const float scale,
|
|
||||||
const float max_bias,
|
|
||||||
const float m0,
|
|
||||||
const float m1,
|
|
||||||
const uint32_t n_head_log2,
|
|
||||||
const int ne00,
|
|
||||||
const int ne01,
|
|
||||||
const int ne02,
|
|
||||||
const int ne03,
|
|
||||||
const int ne10,
|
|
||||||
const int ne11,
|
|
||||||
const int ne12,
|
|
||||||
const int ne13,
|
|
||||||
const int ne31,
|
|
||||||
const int nb31,
|
|
||||||
const int nb01,
|
|
||||||
const int nb02,
|
|
||||||
const int nb03,
|
|
||||||
const int nb11,
|
|
||||||
const int nb12,
|
|
||||||
const int nb13,
|
|
||||||
const int ne0,
|
|
||||||
const int ne1,
|
|
||||||
const int ne2,
|
|
||||||
const int ne3) {
|
|
||||||
#if FP16_AVAILABLE
|
|
||||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
|
||||||
|
|
||||||
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
|
||||||
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
|
||||||
|
|
||||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
|
||||||
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
|
|
||||||
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
|
|
||||||
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
|
|
||||||
const half * maskh = (const half *) mask + ne11*ic0;
|
|
||||||
|
|
||||||
const int stride_KV = nb11 / sizeof(half);
|
|
||||||
const int stride_KV2 = nb11 / sizeof(half2);
|
|
||||||
|
|
||||||
half slopeh = __float2half(1.0f);
|
|
||||||
|
|
||||||
// ALiBi
|
|
||||||
if (max_bias > 0.0f) {
|
|
||||||
const int h = blockIdx.y;
|
|
||||||
|
|
||||||
const float base = h < n_head_log2 ? m0 : m1;
|
|
||||||
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
|
||||||
|
|
||||||
slopeh = __float2half(powf(base, exph));
|
|
||||||
}
|
|
||||||
|
|
||||||
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
|
||||||
constexpr int nwarps = D / WARP_SIZE;
|
|
||||||
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
|
||||||
__builtin_assume(tid < D);
|
|
||||||
|
|
||||||
__shared__ half KQ[ncols*D];
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < ncols; ++j) {
|
|
||||||
KQ[j*D + tid] = -HALF_MAX_HALF;
|
|
||||||
}
|
|
||||||
half2 * KQ2 = (half2 *) KQ;
|
|
||||||
|
|
||||||
half kqmax[ncols];
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < ncols; ++j) {
|
|
||||||
kqmax[j] = -HALF_MAX_HALF;
|
|
||||||
}
|
|
||||||
half kqsum[ncols] = {0.0f};
|
|
||||||
|
|
||||||
__shared__ half kqmax_shared[ncols][WARP_SIZE];
|
|
||||||
__shared__ half kqsum_shared[ncols][WARP_SIZE];
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < ncols; ++j) {
|
|
||||||
if (threadIdx.y == 0) {
|
|
||||||
kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
|
|
||||||
kqsum_shared[j][threadIdx.x] = 0.0f;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
// Convert Q to half2 and store in registers:
|
|
||||||
half2 Q_h2[ncols][D/(2*WARP_SIZE)];
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < ncols; ++j) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
|
||||||
const int i = i0 + threadIdx.x;
|
|
||||||
|
|
||||||
const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
|
|
||||||
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
half2 VKQ[ncols] = {{0.0f, 0.0f}};
|
|
||||||
|
|
||||||
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
|
|
||||||
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
|
|
||||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
|
||||||
|
|
||||||
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
|
|
||||||
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
|
|
||||||
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
|
|
||||||
half kqmax_new = kqmax[0];
|
|
||||||
half kqmax_new_arr[ncols];
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < ncols; ++j) {
|
|
||||||
kqmax_new_arr[j] = kqmax[j];
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma unroll
|
|
||||||
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
|
|
||||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
|
||||||
|
|
||||||
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
half2 sum2[ncols] = {{0.0f, 0.0f}};
|
|
||||||
#pragma unroll
|
|
||||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
|
||||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
|
||||||
|
|
||||||
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < ncols; ++j) {
|
|
||||||
sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < ncols; ++j) {
|
|
||||||
sum2[j] = warp_reduce_sum(sum2[j]);
|
|
||||||
half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
|
|
||||||
sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
|
|
||||||
|
|
||||||
if (ncols == 1) {
|
|
||||||
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
|
|
||||||
} else {
|
|
||||||
kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
|
||||||
KQ[j*D + i_KQ] = sum;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < ncols; ++j) {
|
|
||||||
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
|
|
||||||
|
|
||||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
|
||||||
if (threadIdx.x == 0) {
|
|
||||||
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < ncols; ++j) {
|
|
||||||
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
|
|
||||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
|
||||||
|
|
||||||
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
|
|
||||||
kqmax[j] = kqmax_new_j;
|
|
||||||
|
|
||||||
const half val = hexp(KQ[j*D + tid] - kqmax[j]);
|
|
||||||
kqsum[j] = kqsum[j]*KQ_max_scale + val;
|
|
||||||
KQ[j*D + tid] = val;
|
|
||||||
|
|
||||||
VKQ[j] *= __half2half2(KQ_max_scale);
|
|
||||||
}
|
|
||||||
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
#pragma unroll
|
|
||||||
for (int k0 = 0; k0 < D; k0 += 2) {
|
|
||||||
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
half2 V_k;
|
|
||||||
reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
|
|
||||||
reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < ncols; ++j) {
|
|
||||||
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
__syncthreads();
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < ncols; ++j) {
|
|
||||||
kqsum[j] = warp_reduce_sum(kqsum[j]);
|
|
||||||
if (threadIdx.x == 0) {
|
|
||||||
kqsum_shared[j][threadIdx.y] = kqsum[j];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
#pragma unroll
|
|
||||||
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
|
||||||
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
|
||||||
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
|
|
||||||
|
|
||||||
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
|
|
||||||
if (parallel_blocks == 1) {
|
|
||||||
dst_val /= kqsum[j_VKQ];
|
|
||||||
}
|
|
||||||
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
|
|
||||||
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (parallel_blocks != 1 && tid != 0) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < ncols; ++j) {
|
|
||||||
dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif // FP16_AVAILABLE
|
|
||||||
}
|
|
||||||
|
|
||||||
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
||||||
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
|
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
@ -325,19 +85,9 @@ static __global__ void flash_attn_ext_f16(
|
|||||||
const int stride_Q = nb01 / sizeof(float);
|
const int stride_Q = nb01 / sizeof(float);
|
||||||
const int stride_KV = nb11 / sizeof(half);
|
const int stride_KV = nb11 / sizeof(half);
|
||||||
|
|
||||||
half slopeh = __float2half(1.0f);
|
const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
|
||||||
half2 slope2 = make_half2(1.0f, 1.0f);
|
const half slopeh = __float2half(slopef);
|
||||||
|
const half2 slope2 = make_half2(slopef, slopef);
|
||||||
// ALiBi
|
|
||||||
if (max_bias > 0.0f) {
|
|
||||||
const int h = blockIdx.y;
|
|
||||||
|
|
||||||
const float base = h < n_head_log2 ? m0 : m1;
|
|
||||||
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
|
||||||
|
|
||||||
slopeh = __float2half(powf(base, exph));
|
|
||||||
slope2 = make_half2(slopeh, slopeh);
|
|
||||||
}
|
|
||||||
|
|
||||||
frag_b Q_b[D/16][ncols/frag_n];
|
frag_b Q_b[D/16][ncols/frag_n];
|
||||||
|
|
||||||
@ -655,54 +405,6 @@ static __global__ void flash_attn_ext_f16(
|
|||||||
#endif // FP16_MMA_AVAILABLE
|
#endif // FP16_MMA_AVAILABLE
|
||||||
}
|
}
|
||||||
|
|
||||||
template<int D, int parallel_blocks> // D == head size
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
|
||||||
__launch_bounds__(D, 1)
|
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
|
||||||
static __global__ void flash_attn_combine_results(
|
|
||||||
const float * __restrict__ VKQ_parts,
|
|
||||||
const float2 * __restrict__ VKQ_meta,
|
|
||||||
float * __restrict__ dst) {
|
|
||||||
#if FP16_AVAILABLE
|
|
||||||
VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x;
|
|
||||||
VKQ_meta += parallel_blocks * gridDim.y*blockIdx.x;
|
|
||||||
dst += D * gridDim.y*blockIdx.x;
|
|
||||||
|
|
||||||
const int tid = threadIdx.x;
|
|
||||||
__builtin_assume(tid < D);
|
|
||||||
|
|
||||||
__shared__ float2 meta[parallel_blocks];
|
|
||||||
if (tid < 2*parallel_blocks) {
|
|
||||||
((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid];
|
|
||||||
}
|
|
||||||
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
float kqmax = meta[0].x;
|
|
||||||
#pragma unroll
|
|
||||||
for (int l = 1; l < parallel_blocks; ++l) {
|
|
||||||
kqmax = max(kqmax, meta[l].x);
|
|
||||||
}
|
|
||||||
|
|
||||||
float VKQ_numerator = 0.0f;
|
|
||||||
float VKQ_denominator = 0.0f;
|
|
||||||
#pragma unroll
|
|
||||||
for (int l = 0; l < parallel_blocks; ++l) {
|
|
||||||
const float diff = meta[l].x - kqmax;
|
|
||||||
const float KQ_max_scale = expf(diff);
|
|
||||||
const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
|
|
||||||
*((uint32_t *) &KQ_max_scale) &= ftz_mask;
|
|
||||||
|
|
||||||
VKQ_numerator += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid];
|
|
||||||
VKQ_denominator += KQ_max_scale * meta[l].y;
|
|
||||||
}
|
|
||||||
|
|
||||||
dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
|
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif // FP16_AVAILABLE
|
|
||||||
}
|
|
||||||
|
|
||||||
constexpr int get_max_power_of_2(int x) {
|
constexpr int get_max_power_of_2(int x) {
|
||||||
return x % 2 == 0 ? 2*get_max_power_of_2(x/2) : 1;
|
return x % 2 == 0 ? 2*get_max_power_of_2(x/2) : 1;
|
||||||
}
|
}
|
||||||
@ -727,280 +429,94 @@ static_assert(get_VKQ_stride( 80, 1, 16) == 16, "Test failed.");
|
|||||||
static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed.");
|
static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed.");
|
||||||
static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
|
static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
|
||||||
|
|
||||||
template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
|
template <int D, int cols_per_block, int nwarps, typename KQ_acc_t>
|
||||||
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
void launch_fattn_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
ggml_cuda_pool & pool, cudaStream_t main_stream
|
const ggml_tensor * Q = dst->src[0];
|
||||||
) {
|
|
||||||
ggml_cuda_pool_alloc<float> dst_tmp(pool);
|
|
||||||
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
|
|
||||||
|
|
||||||
if (parallel_blocks > 1) {
|
constexpr int frag_m = cols_per_block == 8 && D % 32 == 0 ? 32 : 16;
|
||||||
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
|
|
||||||
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
|
|
||||||
}
|
|
||||||
|
|
||||||
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
|
|
||||||
const dim3 block_dim(WARP_SIZE, nwarps, 1);
|
|
||||||
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
|
|
||||||
const int shmem = 0;
|
|
||||||
|
|
||||||
float scale = 1.0f;
|
|
||||||
float max_bias = 0.0f;
|
|
||||||
|
|
||||||
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
|
|
||||||
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
|
|
||||||
|
|
||||||
const uint32_t n_head = Q->ne[2];
|
|
||||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
|
||||||
|
|
||||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
|
||||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
|
||||||
|
|
||||||
flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
|
|
||||||
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
|
||||||
(const char *) Q->data,
|
|
||||||
(const char *) K->data,
|
|
||||||
(const char *) V->data,
|
|
||||||
mask ? ((const char *) mask->data) : nullptr,
|
|
||||||
parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
|
|
||||||
scale, max_bias, m0, m1, n_head_log2,
|
|
||||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
|
||||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
|
||||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
|
||||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
|
||||||
K->nb[1], K->nb[2], K->nb[3],
|
|
||||||
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
|
||||||
);
|
|
||||||
CUDA_CHECK(cudaGetLastError());
|
|
||||||
|
|
||||||
if (parallel_blocks == 1) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const dim3 block_dim_combine(D, 1, 1);
|
|
||||||
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
|
|
||||||
const int shmem_combine = 0;
|
|
||||||
|
|
||||||
flash_attn_combine_results<D, parallel_blocks>
|
|
||||||
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
|
|
||||||
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
|
|
||||||
CUDA_CHECK(cudaGetLastError());
|
|
||||||
}
|
|
||||||
|
|
||||||
template <int D, int cols_per_block, int nwarps, int parallel_blocks, typename KQ_acc_t> void launch_fattn_f16_impl(
|
|
||||||
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
|
||||||
ggml_cuda_pool & pool, cudaStream_t main_stream
|
|
||||||
) {
|
|
||||||
ggml_cuda_pool_alloc<float> dst_tmp(pool);
|
|
||||||
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
|
|
||||||
|
|
||||||
if (parallel_blocks > 1) {
|
|
||||||
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
|
|
||||||
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
|
|
||||||
}
|
|
||||||
|
|
||||||
constexpr int frag_m = (cols_per_block) == 8 && (D) % 32 == 0 ? 32 : 16;
|
|
||||||
const dim3 block_dim(WARP_SIZE, nwarps, 1);
|
|
||||||
const dim3 blocks_num(parallel_blocks*(Q->ne[1] + cols_per_block - 1) / cols_per_block, Q->ne[2], Q->ne[3]);
|
|
||||||
const int shmem = 0;
|
|
||||||
|
|
||||||
float scale = 1.0f;
|
|
||||||
float max_bias = 0.0f;
|
|
||||||
|
|
||||||
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
|
|
||||||
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
|
|
||||||
|
|
||||||
const uint32_t n_head = Q->ne[2];
|
|
||||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
|
||||||
|
|
||||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
|
||||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
|
||||||
|
|
||||||
flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>
|
|
||||||
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
|
||||||
(const char *) Q->data,
|
|
||||||
(const char *) K->data,
|
|
||||||
(const char *) V->data,
|
|
||||||
mask ? ((const char *) mask->data) : nullptr,
|
|
||||||
(parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
|
|
||||||
scale, max_bias, m0, m1, n_head_log2,
|
|
||||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
|
||||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
|
||||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
|
||||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
|
||||||
K->nb[1], K->nb[2], K->nb[3],
|
|
||||||
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
|
||||||
);
|
|
||||||
CUDA_CHECK(cudaGetLastError());
|
|
||||||
|
|
||||||
if ((parallel_blocks) == 1) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const dim3 block_dim_combine(D, 1, 1);
|
|
||||||
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
|
|
||||||
const int shmem_combine = 0;
|
|
||||||
|
|
||||||
flash_attn_combine_results<D, parallel_blocks>
|
|
||||||
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
|
|
||||||
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
|
|
||||||
CUDA_CHECK(cudaGetLastError());
|
|
||||||
}
|
|
||||||
|
|
||||||
template <int D, int cols_per_block, int nwarps, typename KQ_acc_t> void launch_fattn_f16(
|
|
||||||
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
|
||||||
const int nsm, ggml_cuda_pool & pool, cudaStream_t main_stream
|
|
||||||
) {
|
|
||||||
const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
|
const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
|
||||||
|
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
|
||||||
|
|
||||||
if (4*blocks_num_pb1 < 2*nsm) {
|
if (4*blocks_num_pb1 < 2*nsm) {
|
||||||
launch_fattn_f16_impl<D, cols_per_block, nwarps, 4, KQ_acc_t>(Q, K, V, KQV, mask, pool, main_stream);
|
constexpr int parallel_blocks = 4;
|
||||||
|
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
|
||||||
|
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (2*blocks_num_pb1 < 2*nsm) {
|
if (2*blocks_num_pb1 < 2*nsm) {
|
||||||
launch_fattn_f16_impl<D, cols_per_block, nwarps, 2, KQ_acc_t>(Q, K, V, KQV, mask, pool, main_stream);
|
constexpr int parallel_blocks = 2;
|
||||||
|
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
|
||||||
|
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
launch_fattn_f16_impl<D, cols_per_block, nwarps, 1, KQ_acc_t>(Q, K, V, KQV, mask, pool, main_stream);
|
constexpr int parallel_blocks = 1;
|
||||||
|
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
|
||||||
|
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
const ggml_tensor * Q = dst->src[0];
|
const ggml_tensor * KQV = dst;
|
||||||
const ggml_tensor * K = dst->src[1];
|
const ggml_tensor * Q = dst->src[0];
|
||||||
const ggml_tensor * V = dst->src[2];
|
|
||||||
|
|
||||||
const ggml_tensor * mask = dst->src[3];
|
|
||||||
|
|
||||||
ggml_tensor * KQV = dst;
|
|
||||||
|
|
||||||
GGML_ASSERT(Q->type == GGML_TYPE_F32);
|
|
||||||
GGML_ASSERT(K->type == GGML_TYPE_F16);
|
|
||||||
GGML_ASSERT(V->type == GGML_TYPE_F16);
|
|
||||||
GGML_ASSERT(KQV->type == GGML_TYPE_F32);
|
|
||||||
|
|
||||||
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
|
|
||||||
GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
|
|
||||||
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
|
|
||||||
|
|
||||||
GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");
|
|
||||||
|
|
||||||
ggml_cuda_set_device(ctx.device);
|
ggml_cuda_set_device(ctx.device);
|
||||||
|
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
|
||||||
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
|
|
||||||
|
|
||||||
const int32_t precision = KQV->op_params[2];
|
const int32_t precision = KQV->op_params[2];
|
||||||
|
|
||||||
if (!fp16_mma_available(cc)) {
|
// On AMD the tile kernels perform poorly, use the vec kernel instead:
|
||||||
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
if (cc >= CC_OFFSET_AMD) {
|
||||||
GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
if (precision == GGML_PREC_DEFAULT) {
|
||||||
|
ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst);
|
||||||
if (Q->ne[1] == 1) {
|
} else {
|
||||||
constexpr int cols_per_block = 1;
|
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||||
constexpr int parallel_blocks = 4;
|
|
||||||
switch (Q->ne[0]) {
|
|
||||||
case 64:
|
|
||||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
|
||||||
break;
|
|
||||||
case 128:
|
|
||||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (Q->ne[1] == 2) {
|
|
||||||
constexpr int cols_per_block = 2;
|
|
||||||
constexpr int parallel_blocks = 4;
|
|
||||||
switch (Q->ne[0]) {
|
|
||||||
case 64:
|
|
||||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
|
||||||
break;
|
|
||||||
case 128:
|
|
||||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (Q->ne[1] <= 4) {
|
|
||||||
constexpr int cols_per_block = 4;
|
|
||||||
constexpr int parallel_blocks = 4;
|
|
||||||
switch (Q->ne[0]) {
|
|
||||||
case 64:
|
|
||||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
|
||||||
break;
|
|
||||||
case 128:
|
|
||||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!fast_fp16_available(cc)) {
|
||||||
if (Q->ne[1] <= 8) {
|
if (Q->ne[1] <= 8) {
|
||||||
constexpr int cols_per_block = 8;
|
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||||
constexpr int parallel_blocks = 4;
|
} else {
|
||||||
switch (Q->ne[0]) {
|
ggml_cuda_flash_attn_ext_tile_f32(ctx, dst);
|
||||||
case 64:
|
|
||||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
|
||||||
break;
|
|
||||||
case 128:
|
|
||||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
constexpr int cols_per_block = 8;
|
if (!fp16_mma_available(cc)) {
|
||||||
constexpr int parallel_blocks = 1;
|
if (Q->ne[1] <= 8) {
|
||||||
switch (Q->ne[0]) {
|
ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst);
|
||||||
case 64:
|
} else {
|
||||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
ggml_cuda_flash_attn_ext_tile_f16(ctx, dst);
|
||||||
break;
|
|
||||||
case 128:
|
|
||||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (precision != GGML_PREC_DEFAULT) {
|
if (precision != GGML_PREC_DEFAULT) {
|
||||||
|
if (Q->ne[1] == 1 && (Q->ne[0] == 64 || Q->ne[0] == 128)) {
|
||||||
|
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
|
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
|
||||||
constexpr int cols_per_block = 16;
|
constexpr int cols_per_block = 16;
|
||||||
constexpr int nwarps = 4;
|
constexpr int nwarps = 4;
|
||||||
switch (Q->ne[0]) {
|
switch (Q->ne[0]) {
|
||||||
case 64:
|
case 64:
|
||||||
launch_fattn_f16< 64, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
launch_fattn_f16< 64, cols_per_block, nwarps, float>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case 80:
|
case 80:
|
||||||
launch_fattn_f16< 80, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
launch_fattn_f16< 80, cols_per_block, nwarps, float>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case 96:
|
case 96:
|
||||||
launch_fattn_f16< 96, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
launch_fattn_f16< 96, cols_per_block, nwarps, float>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case 112:
|
case 112:
|
||||||
launch_fattn_f16<112, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
launch_fattn_f16<112, cols_per_block, nwarps, float>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case 128:
|
case 128:
|
||||||
launch_fattn_f16<128, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
launch_fattn_f16<128, cols_per_block, nwarps, float>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case 256:
|
case 256:
|
||||||
launch_fattn_f16<256, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
launch_fattn_f16<256, cols_per_block, nwarps, float>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
@ -1011,22 +527,22 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
|||||||
constexpr int nwarps = 4;
|
constexpr int nwarps = 4;
|
||||||
switch (Q->ne[0]) {
|
switch (Q->ne[0]) {
|
||||||
case 64:
|
case 64:
|
||||||
launch_fattn_f16< 64, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
launch_fattn_f16< 64, cols_per_block, nwarps, float>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case 80:
|
case 80:
|
||||||
launch_fattn_f16< 80, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
launch_fattn_f16< 80, cols_per_block, nwarps, float>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case 96:
|
case 96:
|
||||||
launch_fattn_f16< 96, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
launch_fattn_f16< 96, cols_per_block, nwarps, float>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case 112:
|
case 112:
|
||||||
launch_fattn_f16<112, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
launch_fattn_f16<112, cols_per_block, nwarps, float>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case 128:
|
case 128:
|
||||||
launch_fattn_f16<128, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
launch_fattn_f16<128, cols_per_block, nwarps, float>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
// case 256:
|
// case 256:
|
||||||
// launch_fattn_f16<256, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
// launch_fattn_f16<256, cols_per_block, nwarps, float>(ctx, dst);
|
||||||
// break;
|
// break;
|
||||||
default:
|
default:
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
@ -1037,22 +553,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
|
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
|
||||||
constexpr int cols_per_block = 1;
|
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||||
constexpr int parallel_blocks = 4;
|
|
||||||
switch (Q->ne[0]) {
|
|
||||||
case 64:
|
|
||||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
|
||||||
break;
|
|
||||||
case 128:
|
|
||||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
|
||||||
break;
|
|
||||||
case 256:
|
|
||||||
launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1061,16 +562,16 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
|||||||
constexpr int nwarps = 4;
|
constexpr int nwarps = 4;
|
||||||
switch (Q->ne[0]) {
|
switch (Q->ne[0]) {
|
||||||
case 64:
|
case 64:
|
||||||
launch_fattn_f16< 64, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
launch_fattn_f16< 64, cols_per_block, nwarps, half>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case 96:
|
case 96:
|
||||||
launch_fattn_f16< 96, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
launch_fattn_f16< 96, cols_per_block, nwarps, half>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case 128:
|
case 128:
|
||||||
launch_fattn_f16<128, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
launch_fattn_f16<128, cols_per_block, nwarps, half>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case 256:
|
case 256:
|
||||||
launch_fattn_f16<256, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
launch_fattn_f16<256, cols_per_block, nwarps, half>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
@ -1084,22 +585,22 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
         constexpr int nwarps = 4;
         switch (Q->ne[0]) {
             case 64:
-                launch_fattn_f16< 64, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
+                launch_fattn_f16< 64, cols_per_block, nwarps, half>(ctx, dst);
                 break;
             case 80:
-                launch_fattn_f16< 80, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
+                launch_fattn_f16< 80, cols_per_block, nwarps, half>(ctx, dst);
                 break;
             case 96:
-                launch_fattn_f16< 96, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
+                launch_fattn_f16< 96, cols_per_block, nwarps, half>(ctx, dst);
                 break;
             case 112:
-                launch_fattn_f16<112, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
+                launch_fattn_f16<112, cols_per_block, nwarps, half>(ctx, dst);
                 break;
             case 128:
-                launch_fattn_f16<128, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
+                launch_fattn_f16<128, cols_per_block, nwarps, half>(ctx, dst);
                 break;
             case 256:
-                launch_fattn_f16<256, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
+                launch_fattn_f16<256, cols_per_block, nwarps, half>(ctx, dst);
                 break;
             default:
                 GGML_ASSERT(false);
@ -1112,22 +613,22 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
         constexpr int nwarps = 4;
         switch (Q->ne[0]) {
             case 64:
-                launch_fattn_f16< 64, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
+                launch_fattn_f16< 64, cols_per_block, nwarps, half>(ctx, dst);
                 break;
             case 80:
-                launch_fattn_f16< 80, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
+                launch_fattn_f16< 80, cols_per_block, nwarps, half>(ctx, dst);
                 break;
             case 96:
-                launch_fattn_f16< 96, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
+                launch_fattn_f16< 96, cols_per_block, nwarps, half>(ctx, dst);
                 break;
             case 112:
-                launch_fattn_f16<112, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
+                launch_fattn_f16<112, cols_per_block, nwarps, half>(ctx, dst);
                 break;
             case 128:
-                launch_fattn_f16<128, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
+                launch_fattn_f16<128, cols_per_block, nwarps, half>(ctx, dst);
                 break;
             case 256:
-                launch_fattn_f16<256, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
+                launch_fattn_f16<256, cols_per_block, nwarps, half>(ctx, dst);
                 break;
             default:
                 GGML_ASSERT(false);

ggml-cuda/mmq.cu (1253 lines changed) — file diff suppressed because it is too large
@ -58,10 +58,10 @@ static __global__ void rope(
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }

-template<typename T, bool has_pos>
+template<typename T, bool has_pos, bool has_freq_facs>
 static __global__ void rope_neox(
     const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims, const float * freq_factors
 ) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

@ -88,7 +88,9 @@ static __global__ void rope_neox(
     float cur_rot = inv_ndims * ic - ib;

     const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
+    const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
+
+    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f)/freq_factor;

     float cos_theta, sin_theta;
     rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
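Note: the kernel now divides each pair's base angle by an optional frequency factor; with freq_factors == nullptr the factor is 1.0f and the old behaviour is preserved. A small host-side restatement of the same expression, as a sketch (it folds col and ic together, which the kernel derives separately from thread indices):

// Sketch restating the updated angle computation above (plain host code, not the kernel).
#include <cmath>

static float rope_neox_theta_sketch(int p, int ic, float freq_scale, float theta_scale,
                                    const float * freq_factors /* may be null */) {
    const float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
    return p*freq_scale*powf(theta_scale, ic/2.0f)/freq_factor;
}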
@ -164,7 +166,7 @@ static void rope_cuda(
 template<typename T>
 static void rope_neox_cuda(
     const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream
 ) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
@ -175,15 +177,29 @@ static void rope_neox_cuda(
     const float inv_ndims = -1.0f / n_dims;

     if (pos == nullptr) {
-        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-            theta_scale, inv_ndims
-        );
+        if (freq_factors == nullptr) {
+            rope_neox<T, false, false><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+                theta_scale, inv_ndims, freq_factors
+            );
+        } else {
+            rope_neox<T, false, true><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+                theta_scale, inv_ndims, freq_factors
+            );
+        }
     } else {
-        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-            theta_scale, inv_ndims
-        );
+        if (freq_factors == nullptr) {
+            rope_neox<T, true, false><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+                theta_scale, inv_ndims, freq_factors
+            );
+        } else {
+            rope_neox<T, true, true><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+                theta_scale, inv_ndims, freq_factors
+            );
+        }
     }
 }

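Note: the host wrapper above expands into four instantiations of rope_neox&lt;T, has_pos, has_freq_facs&gt; so that both optional inputs are resolved at compile time rather than branched on inside the kernel. The same pattern in isolation, as a sketch with a placeholder launcher:

// Sketch: mapping two runtime nullptr checks onto compile-time template flags.
#include <cstdint>
#include <cstdio>

template <bool has_pos, bool has_freq_facs>
static void launch_sketch() {
    // A real launcher would issue rope_neox<T, has_pos, has_freq_facs><<<grid, block, 0, stream>>>(...).
    std::printf("has_pos=%d has_freq_facs=%d\n", (int) has_pos, (int) has_freq_facs);
}

static void dispatch_sketch(const int32_t * pos, const float * freq_factors) {
    if (pos == nullptr) {
        if (freq_factors == nullptr) { launch_sketch<false, false>(); } else { launch_sketch<false, true>(); }
    } else {
        if (freq_factors == nullptr) { launch_sketch<true,  false>(); } else { launch_sketch<true,  true>(); }
    }
}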
@ -214,24 +230,27 @@ static void rope_cuda_f32(

 static void rope_neox_cuda_f16(
     const half * x, half * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) {
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {

-    rope_neox_cuda<half>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
+    rope_neox_cuda<half>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
 }

 static void rope_neox_cuda_f32(
     const float * x, float * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream
 ) {

-    rope_neox_cuda<float>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
+    rope_neox_cuda<float>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
 }

 void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
+    const ggml_tensor * src2 = dst->src[2];

     const float * src0_d = (const float *)src0->data;
     const float * src1_d = (const float *)src1->data;

     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

@ -241,7 +260,6 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
-    const int64_t ne2 = dst->ne[2];
     const int64_t nrows = ggml_nrows(src0);

     //const int n_past = ((int32_t *) dst->op_params)[0];
@ -259,16 +277,22 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

+    const float * freq_factors = nullptr;
     const int32_t * pos = nullptr;
-    if ((mode & 1) == 0) {
-        GGML_ASSERT(src1->type == GGML_TYPE_I32);
-        GGML_ASSERT(src1->ne[0] == ne2);
-        pos = (const int32_t *) src1_d;
-    }

     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;

+    pos = (const int32_t *) src1_d;
+
+    if (is_neox) {
+        if (src2 != nullptr) {
+            freq_factors = (const float *) src2->data;
+        }
+    } else {
+        GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
+    }
+
     rope_corr_dims corr_dims;
     ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);

@ -280,12 +304,12 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         if (src0->type == GGML_TYPE_F32) {
             rope_neox_cuda_f32(
                 (const float *)src0_d, (float *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, stream
+                attn_factor, corr_dims, freq_factors, stream
             );
         } else if (src0->type == GGML_TYPE_F16) {
             rope_neox_cuda_f16(
                 (const half *)src0_d, (half *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, stream
+                attn_factor, corr_dims, freq_factors, stream
             );
         } else {
             GGML_ASSERT(false);
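Note: to make the effect of freq_factors concrete, a standalone numeric illustration (values invented for demonstration; they are not taken from this commit or from any model):

// Dividing a pair's base angle by a frequency factor slows that pair's rotation,
// which is how long-context RoPE variants stretch selected dimensions.
#include <cmath>
#include <cstdio>

int main() {
    const float freq_base   = 10000.0f;
    const int   n_dims      = 128;
    const float theta_scale = powf(freq_base, -2.0f/n_dims);
    const int   p           = 4096;                    // token position
    const float factors[2]  = { 1.0f, 8.0f };          // unscaled vs. 8x slower

    for (int i = 0; i < 2; ++i) {
        const float theta = p*powf(theta_scale, 16.0f)/factors[i];   // dimension pair 16
        std::printf("freq_factor %.1f -> theta %.3f\n", factors[i], theta);
    }
    return 0;
}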
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user