Merge branch 'ggerganov:master' into bitnet

Eddie-Wang 2024-06-10 10:51:47 +08:00 committed by GitHub
commit 841c903ff9
218 changed files with 5021 additions and 8134 deletions


@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements


@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements


@@ -23,10 +23,13 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV LLAMA_CUDA=1
-RUN make -j$(nproc)
+RUN make -j$(nproc) main
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+RUN apt-get update && \
+    apt-get install -y libgomp1
COPY --from=build /app/main /main
ENTRYPOINT [ "/main" ]


@@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
-RUN make -j$(nproc)
+RUN make -j$(nproc) main
ENTRYPOINT [ "/app/main" ]


@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION as build
# Install build tools
-RUN apt update && apt install -y git build-essential cmake wget
+RUN apt update && apt install -y git build-essential cmake wget libgomp1
# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \


@@ -9,10 +9,13 @@ WORKDIR /app
COPY . .
-RUN make -j$(nproc)
+RUN make -j$(nproc) main
FROM ubuntu:$UBUNTU_VERSION as runtime
+RUN apt-get update && \
+    apt-get install -y libgomp1
COPY --from=build /app/main /main
ENV LC_ALL=C.utf8


@@ -25,12 +25,12 @@ ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
-RUN make -j$(nproc)
+RUN make -j$(nproc) server
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y libcurl4-openssl-dev libgomp1
COPY --from=build /app/server /server


@@ -11,12 +11,12 @@ COPY . .
ENV LLAMA_CURL=1
-RUN make -j$(nproc)
+RUN make -j$(nproc) server
FROM ubuntu:$UBUNTU_VERSION as runtime
RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y libcurl4-openssl-dev libgomp1
COPY --from=build /app/server /server


@@ -0,0 +1,5 @@
- Self Reported Review Complexity:
  - [ ] Review Complexity : Low
  - [ ] Review Complexity : Medium
  - [ ] Review Complexity : High
- [ ] I have read the [contributing guidelines](CONTRIBUTING.md)


@@ -688,8 +688,6 @@ jobs:
env:
OPENBLAS_VERSION: 0.3.23
-  OPENCL_VERSION: 2023.04.17
-  CLBLAST_VERSION: 1.6.0
SDE_VERSION: 9.33.0-2024-01-07
VULKAN_VERSION: 1.3.261.1
@@ -706,8 +704,6 @@ jobs:
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
-  - build: 'clblast-x64'
-    defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute-x64'
@@ -732,27 +728,6 @@ jobs:
run: |
git submodule update --init kompute
- name: Download OpenCL SDK
id: get_opencl
if: ${{ matrix.build == 'clblast-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
mkdir $env:RUNNER_TEMP/opencl
tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
- name: Download CLBlast
id: get_clblast
if: ${{ matrix.build == 'clblast-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
$txt = Get-Content -Path $f -Raw
$txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
}
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas-x64' }}
@@ -786,13 +761,6 @@ jobs:
cmake -S . -B build ${{ matrix.defines }}
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add clblast.dll
id: add_clblast_dll
if: ${{ matrix.build == 'clblast-x64' }}
run: |
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
- name: Add libopenblas.dll
id: add_libopenblas_dll
if: ${{ matrix.build == 'openblas-x64' }}
@@ -816,7 +784,7 @@
- name: Test
id: cmake_test
# not all machines have native AVX-512
-  if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+  if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
@@ -1071,7 +1039,7 @@
# hypervisor: 'qemu'
# run: |
# sudo pkg update
-  # sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
+  # sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
release:

.gitignore

@@ -34,9 +34,11 @@ ggml-metal-embed.metal
lcov-report/
gcovr-report/
+tags
build*
!build.zig
cmake-build-*
+android-ndk-*
out/
tmp/


@@ -111,7 +111,6 @@ option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for Flas
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
-option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_VULKAN "llama: use Vulkan" OFF)
option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF)
option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF)
@@ -417,6 +416,8 @@ if (LLAMA_CUDA)
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
+file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
+list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_USE_CUDA)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
@@ -502,22 +503,6 @@ if (LLAMA_RPC)
set(GGML_SOURCES_RPC ggml-rpc.cpp)
endif()
if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")
set(GGML_HEADERS_OPENCL ggml-opencl.h)
set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
add_compile_definitions(GGML_USE_CLBLAST)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
else()
message(WARNING "CLBlast not found")
endif()
endif()
if (LLAMA_VULKAN)
find_package(Vulkan)
if (Vulkan_FOUND)
@@ -557,12 +542,17 @@ if (LLAMA_VULKAN)
endif()
if (LLAMA_HIPBLAS)
-    if ($ENV{ROCM_PATH})
-        set(ROCM_PATH $ENV{ROCM_PATH})
+    if (NOT EXISTS $ENV{ROCM_PATH})
+        if (NOT EXISTS /opt/rocm)
+            set(ROCM_PATH /usr)
+        else()
+            set(ROCM_PATH /opt/rocm)
+        endif()
    else()
-        set(ROCM_PATH /opt/rocm)
+        set(ROCM_PATH $ENV{ROCM_PATH})
    endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
+list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
# CMake on Windows doesn't support the HIP language yet
if(WIN32)
@@ -600,6 +590,8 @@ if (LLAMA_HIPBLAS)
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
+file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
+list(APPEND GGML_SOURCES_ROCM ${SRCS})
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
@@ -1260,7 +1252,6 @@ add_library(ggml OBJECT
ggml-quants.c
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
-${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
@@ -1348,8 +1339,9 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
-    "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
-    "${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
+    "${GGML_HEADERS_CUDA}"
+    "${GGML_HEADERS_METAL}"
+    "${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml PUBLIC_HEADER)

CONTRIBUTING.md

@@ -0,0 +1,14 @@
# Contributing Guidelines
## Checklist
* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
* Execute [the full CI locally on your machine](ci/README.md) before publishing
## PR formatting
* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
- The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your convenience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`


@@ -1,7 +1,7 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-  simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
+  simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
# Binaries only useful for tests
@@ -444,6 +444,7 @@ ifdef LLAMA_CUBLAS
endif
OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/mmq*.cu))
ifdef LLAMA_CUDA_FA_ALL_QUANTS
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
else
@@ -547,23 +548,6 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h
$(NVCC_COMPILE)
endif # LLAMA_CUDA
ifdef LLAMA_CLBLAST
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
# Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin)
MK_LDFLAGS += -lclblast -framework OpenCL
else
MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
endif
OBJS += ggml-opencl.o
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_CLBLAST
ifdef LLAMA_VULKAN
MK_CPPFLAGS += -DGGML_USE_VULKAN
MK_LDFLAGS += -lvulkan
@@ -914,10 +898,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)


@@ -29,7 +29,7 @@ The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based o
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
-It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
+It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
## News

README.md

@@ -77,7 +77,7 @@ variety of hardware - locally and in the cloud.
- AVX, AVX2 and AVX512 support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
-- Vulkan, SYCL, and (partial) OpenCL backend support
+- Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
@@ -364,17 +364,6 @@ In order to build llama.cpp you have four different options.
cmake --build build --config Debug
```
- Using `Zig` (version 0.11 or later):
Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
it's also possible to cross compile for other operating systems and architectures:
```bash
zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
```
The `zig targets` command will give you valid options to use.
- Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@@ -382,16 +371,11 @@ In order to build llama.cpp you have four different options.
3. Install compilation dependencies.
```bash
-    sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
-        opencl clblast openblas
+    sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
```
-    **Notes:** With this packages you can build llama.cpp with OPENBLAS and
-    CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
-    the instructions for use and activate this options in this document below.
### Homebrew
On Mac and Linux, the homebrew package manager can be used via
@@ -410,7 +394,7 @@ argument.
### BLAS Build
-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
- #### Accelerate Framework:
@@ -564,111 +548,6 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### CLBlast
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
- For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.
- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
- <details>
<summary>Installing the OpenCL SDK from source</summary>
```sh
git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
cd OpenCL-SDK
cmake -B build -DBUILD_DOCS=OFF \
-DBUILD_EXAMPLES=OFF \
-DBUILD_TESTING=OFF \
-DOPENCL_SDK_BUILD_SAMPLES=OFF \
-DOPENCL_SDK_TEST_SAMPLES=OFF
cmake --build build
cmake --install build --prefix /some/path
```
</details>
##### Installing CLBlast
Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
Linux packaging:
Fedora Linux:
```bash
sudo dnf install clblast
```
Alternatively, they may be built from source.
- <details>
<summary>Windows:</summary>
```cmd
set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
git clone https://github.com/CNugteren/CLBlast.git
cd CLBlast
cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/CLBlast
```
(note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds)
- <details>
<summary>Unix:</summary>
```sh
git clone https://github.com/CNugteren/CLBlast.git
cd CLBlast
cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
cmake --build build --config Release
cmake --install build --prefix /some/path
```
Where `/some/path` is where the built library will be installed (default is `/usr/local`).
</details>
##### Building Llama with CLBlast
- Build with make:
```sh
make LLAMA_CLBLAST=1
```
- CMake (Unix):
```sh
cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
cmake --build build --config Release
```
- CMake (Windows):
```cmd
set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/LlamaCPP
```
##### Running Llama with CLBlast
The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
The selection can be a number (starting from 0) or a text string to search:
```sh
GGML_OPENCL_PLATFORM=1 ./main ...
GGML_OPENCL_DEVICE=2 ./main ...
GGML_OPENCL_PLATFORM=Intel ./main ...
GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
```
The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
Using the variables it is possible to select a CPU-based driver as well, if so desired.
You can get a list of platforms and devices from the `clinfo -l` command, etc.
- #### Vulkan
**With docker**:
@@ -719,7 +598,7 @@ Building the program with BLAS support may lead to some performance improvements
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
-Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derievatives.
+Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
```bash


@@ -84,4 +84,4 @@ endif ()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)

File diff suppressed because it is too large.


@@ -56,66 +56,67 @@ struct gpt_params {
    uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
    int32_t n_threads = cpu_get_num_math();
    int32_t n_threads_draft = -1;
    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
    int32_t n_threads_batch_draft = -1;
    int32_t n_predict = -1; // new tokens to predict
-   int32_t n_ctx = 512; // context size
+   int32_t n_ctx = 0; // context size
    int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep = 0; // number of tokens to keep from initial prompt
    int32_t n_draft = 5; // number of tokens to draft during speculative decoding
    int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel = 1; // number of parallel sequences to decode
    int32_t n_sequences = 1; // number of sequences to decode
    float p_split = 0.1f; // speculative decoding split probability
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-   llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
    int32_t n_beams = 0; // if non-zero then use beam search of given width.
    int32_t grp_attn_n = 1; // group-attention factor
    int32_t grp_attn_w = 512; // group-attention width
    int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
    float rope_freq_base = 0.0f; // RoPE base frequency
    float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
    float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
    float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
    float yarn_beta_fast = 32.0f; // YaRN low correction dim
    float yarn_beta_slow = 1.0f; // YaRN high correction dim
    int32_t yarn_orig_ctx = 0; // YaRN original context length
    float defrag_thold = -1.0f; // KV cache defragmentation threshold
-   std::string rpc_servers = ""; // comma separated list of RPC servers
    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data = nullptr;
    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
+   enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    // // sampling parameters
    struct llama_sampling_params sparams;
    std::string model = ""; // model path
    std::string model_draft = ""; // draft model for speculative decoding
    std::string model_alias = "unknown"; // model alias
    std::string model_url = ""; // model url to download
    std::string hf_repo = ""; // HF repo
    std::string hf_file = ""; // HF file
    std::string prompt = "";
    std::string prompt_file = ""; // store the external prompt file name
    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
    std::string input_prefix = ""; // string to prefix user inputs with
    std::string input_suffix = ""; // string to suffix user inputs with
-   std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string logdir = ""; // directory in which to save YAML log files
    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
    std::string logits_file = ""; // file for saving *all* logits
+   std::string rpc_servers = ""; // comma separated list of RPC servers
+   std::vector<std::string> in_files; // all input files
+   std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
    // TODO: avoid tuple, use struct
@@ -124,37 +125,36 @@ struct gpt_params {
    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+   int32_t verbosity = 0;
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end = -1; // layer range for control vector
-   int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+   int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
-   int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+   int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
    // (which is more convenient to use for plotting)
    //
    bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
    size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
    bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
-   size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+   size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
    bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
    bool kl_divergence = false; // compute KL divergence
-   bool random_prompt = false; // do not randomize prompt if none provided
+   bool usage = false; // print usage
    bool use_color = false; // use color to distinguish generations and inputs
-   bool interactive = false; // interactive mode
-   bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
    bool special = false; // enable special token output
+   bool interactive = false; // interactive mode
+   bool interactive_first = false; // wait for user input immediately
    bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
-   bool chatml = false; // chatml mode (used for models trained on chatml syntax)
    bool prompt_cache_all = false; // save user input and generations to prompt cache
    bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
    bool embedding = false; // get only sentence embedding
-   bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
+   bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
-   bool interactive_first = false; // wait for user input immediately
    bool multiline_input = false; // reverse the usage of `\`
    bool simple_io = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching = true; // insert new sequences for decoding on-the-fly
@@ -162,7 +162,6 @@ struct gpt_params {
    bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
    bool ignore_eos = false; // ignore generated EOS tokens
-   bool instruct = false; // instruction mode (used for Alpaca models)
    bool logits_all = false; // return logits for all tokens in the batch
    bool use_mmap = true; // use mmap for faster loads
    bool use_mlock = false; // use mlock to keep model in memory
@@ -180,6 +179,59 @@ struct gpt_params {
    // multimodal models (see examples/llava)
    std::string mmproj = ""; // path to multimodal projector
    std::vector<std::string> image; // path to image file(s)
// server params
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests
std::string hostname = "127.0.0.1";
std::string public_path = "";
std::string chat_template = "";
std::string system_prompt = "";
std::vector<std::string> api_keys;
std::string ssl_file_key = "";
std::string ssl_file_cert = "";
bool endpoint_slots = true;
bool endpoint_metrics = false;
bool log_json = false;
std::string slot_save_path;
float slot_prompt_similarity = 0.5f;
// batched-bench params
bool is_pp_shared = false;
std::vector<int32_t> n_pp;
std::vector<int32_t> n_tg;
std::vector<int32_t> n_pl;
// retrieval params
std::vector<std::string> context_files; // context files to embed
int32_t chunk_size = 64; // chunk size for context embedding
std::string chunk_separator = "\n"; // chunk separator for context embedding
// passkey params
int32_t n_junk = 250; // number of times to repeat the junk text
int32_t i_pos = -1; // position of the passkey in the junk text
// imatrix params
std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
int32_t i_chunk = 0; // start processing from this chunk
bool process_output = false; // collect data for the output tensor
bool compute_ppl = true; // whether to compute perplexity
};
void gpt_params_handle_model_default(gpt_params & params);
@@ -199,7 +251,20 @@ std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
-std::string string_random_prompt(std::mt19937 & rng);
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
std::vector<T> values;
std::istringstream str_stream(str);
std::string token;
while (std::getline(str_stream, token, delim)) {
T value;
std::istringstream token_stream(token);
token_stream >> value;
values.push_back(value);
}
return values;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
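For illustration, a minimal standalone sketch of the `string_split<T>` helper added in the hunk above. The template body is copied from the diff so the snippet compiles on its own; the sample comma-separated values are assumptions, not taken from the commit.

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Copied from the hunk above so the example is self-contained outside the llama.cpp tree.
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value;
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}

int main() {
    // e.g. a comma separated CLI list such as the batched-bench n_pp/n_tg/n_pl vectors above (illustrative input)
    for (int v : string_split<int>("128,256,512", ',')) {
        std::cout << v << "\n";
    }
    return 0;
}
```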
@@ -212,6 +277,7 @@ bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);
std::string fs_get_cache_directory();
+std::string fs_get_cache_file(const std::string & filename);
//
// Model utils
@@ -282,6 +348,13 @@ std::string llama_detokenize_bpe(
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);
//
// Chat template utils
//
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool llama_chat_verify_template(const std::string & tmpl);
//
// KV cache utils
//


@@ -46,8 +46,12 @@ namespace grammar_parser {
state.rules[rule_id] = rule;
}
+static bool is_digit_char(char c) {
+    return '0' <= c && c <= '9';
+}
static bool is_word_char(char c) {
-    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
+    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
}
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
@@ -99,6 +103,17 @@ namespace grammar_parser {
return pos;
}
static const char * parse_int(const char * src) {
const char * pos = src;
while (is_digit_char(*pos)) {
pos++;
}
if (pos == src) {
throw std::runtime_error(std::string("expecting integer at ") + src);
}
return pos;
}
static std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') {
switch (src[1]) {
@@ -137,6 +152,60 @@ namespace grammar_parser {
bool is_nested) {
size_t last_sym_start = out_elements.size();
const char * pos = src;
auto handle_repetitions = [&](int min_times, int max_times) {
if (last_sym_start == out_elements.size()) {
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
}
// apply transformation to previous symbol (last_sym_start to end) according to
// the following rewrite rules:
// S{m,n} --> S S S (m times) S'(n-m)
// S'(x) ::= S S'(x-1) |
// (... n-m definitions of these S' rules ...)
// S'(1) ::= S |
// S{m,} --> S S S (m times) S'
// S' ::= S S' |
// S* --> S{0,}
// --> S' ::= S S' |
// S+ --> S{1,}
// --> S S'
// S' ::= S S' |
// S? --> S{0,1}
// --> S'
// S' ::= S |
std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
if (min_times == 0) {
out_elements.resize(last_sym_start);
} else {
// Repeat the previous elements (min_times - 1) times
for (int i = 1; i < min_times; i++) {
out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
}
}
uint32_t last_rec_rule_id = 0;
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
std::vector<llama_grammar_element> rec_rule(previous_elements);
for (int i = 0; i < n_opt; i++) {
rec_rule.resize(previous_elements.size());
uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
if (i > 0 || max_times < 0) {
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
}
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, rec_rule_id, rec_rule);
last_rec_rule_id = rec_rule_id;
}
if (n_opt > 0) {
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
}
};
while (*pos) {
if (*pos == '"') { // literal string
pos++;
@@ -197,40 +266,51 @@ namespace grammar_parser {
throw std::runtime_error(std::string("expecting ')' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
-} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
-    if (last_sym_start == out_elements.size()) {
-        throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
+} else if (*pos == '.') { // any char
+    last_sym_start = out_elements.size();
+    out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
}
// apply transformation to previous symbol (last_sym_start to end) according to
// rewrite rules:
// S* --> S' ::= S S' |
// S+ --> S' ::= S S' | S
// S? --> S' ::= S |
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
std::vector<llama_grammar_element> sub_rule;
// add preceding symbol to generated rule
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
if (*pos == '*' || *pos == '+') {
// cause generated rule to recurse
sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
}
// mark start of alternate def
sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
if (*pos == '+') {
// add preceding symbol as alternate only for '+' (otherwise empty)
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
}
sub_rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, sub_rule_id, sub_rule);
// in original rule, replace previous symbol with reference to generated rule
out_elements.resize(last_sym_start);
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(0, -1);
} else if (*pos == '+') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(1, -1);
} else if (*pos == '?') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(0, 1);
} else if (*pos == '{') {
pos = parse_space(pos + 1, is_nested);
if (!is_digit_char(*pos)) {
throw std::runtime_error(std::string("expecting an int at ") + pos);
}
const char * int_end = parse_int(pos);
int min_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
int max_times = -1;
if (*pos == '}') {
max_times = min_times;
pos = parse_space(pos + 1, is_nested);
} else if (*pos == ',') {
pos = parse_space(pos + 1, is_nested);
if (is_digit_char(*pos)) {
const char * int_end = parse_int(pos);
max_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
}
if (*pos != '}') {
throw std::runtime_error(std::string("expecting '}' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else {
throw std::runtime_error(std::string("expecting ',' at ") + pos);
}
handle_repetitions(min_times, max_times);
} else {
break;
}
@@ -325,6 +405,7 @@ namespace grammar_parser {
case LLAMA_GRETYPE_CHAR_NOT: return true;
case LLAMA_GRETYPE_CHAR_ALT: return true;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+case LLAMA_GRETYPE_CHAR_ANY: return true;
default: return false;
}
}
@@ -339,6 +420,7 @@
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
+case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
}
switch (elem.type) {
case LLAMA_GRETYPE_END:
@@ -350,6 +432,7 @@
case LLAMA_GRETYPE_CHAR_NOT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
case LLAMA_GRETYPE_CHAR_ALT:
+case LLAMA_GRETYPE_CHAR_ANY:
fprintf(file, "(\"");
print_grammar_char(file, elem.value);
fprintf(file, "\") ");
@@ -407,11 +490,15 @@
}
print_grammar_char(file, elem.value);
break;
+case LLAMA_GRETYPE_CHAR_ANY:
+    fprintf(file, ".");
+    break;
}
if (is_char_element(elem)) {
switch (rule[i + 1].type) {
case LLAMA_GRETYPE_CHAR_ALT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+case LLAMA_GRETYPE_CHAR_ANY:
break;
default:
fprintf(file, "] ");
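For illustration, a minimal sketch of the new bounded-repetition syntax end to end, assuming it is compiled inside the llama.cpp tree and linked against `common/grammar-parser.cpp`; printing the parsed grammar exposes the generated helper rules described by the `S{m,n}` rewrite comment in `handle_repetitions`.

```cpp
// Hypothetical standalone driver (not part of the commit): parse a GBNF rule that
// uses the new {m,n} repetition operator and dump the rewritten rules.
#include "grammar-parser.h"   // common/grammar-parser.h from the llama.cpp tree
#include <cstdio>

int main() {
    // "a" repeated two to four times, i.e. S{2,4} in the rewrite-rule notation above
    const char * gbnf = "root ::= \"a\"{2,4}\n";
    grammar_parser::parse_state state = grammar_parser::parse(gbnf);
    grammar_parser::print_grammar(stdout, state); // shows root plus the generated helper rules
    return 0;
}
```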


@ -16,58 +16,27 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa
static std::string repeat(const std::string & str, size_t n); static std::string repeat(const std::string & str, size_t n);
-static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
+    auto has_max = max_items != std::numeric_limits<int>::max();
+
+    if (min_items == 0 && max_items == 1) {
+        return item_rule + "?";
+    }
+
     if (separator_rule.empty()) {
-        if (min_items == 0 && max_items == 1) {
-            return item_rule + "?";
-        } else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
+        if (min_items == 1 && !has_max) {
             return item_rule + "+";
-        }
-    }
-
-    std::string result;
-    if (min_items > 0) {
-        if (item_rule_is_literal && separator_rule.empty()) {
-            result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
+        } else if (min_items == 0 && !has_max) {
+            return item_rule + "*";
         } else {
-            std::vector<std::string> items(min_items, item_rule);
-            result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
+            return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
         }
     }
 
-    std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
-        auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
-
-        if (up_to_n == 0) {
-            return "";
-        } else if (up_to_n == 1) {
-            return "(" + content + ")?";
-        } else if (!separator_rule.empty() && !prefix_with_sep) {
-            return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
-        } else {
-            std::string res = repeat("(" + content + " ", up_to_n);
-            // strip trailing space
-            res = res.substr(0, res.length() - 1);
-            res += repeat(")?", up_to_n);
-            return res;
-        }
-    };
-
-    if (min_items > 0 && max_items != min_items) {
-        result += " ";
+    auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
+    if (min_items == 0) {
+        result = "(" + result + ")?";
     }
 
-    if (max_items != std::numeric_limits<int>::max()) {
-        result += opt_repetitions(max_items - min_items, min_items > 0);
-    } else {
-        std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
-        if (min_items == 0 && !separator_rule.empty()) {
-            result = "(" + item_rule + " " + item_operator + "*)?";
-        } else {
-            result += item_operator + "*";
-        }
-    }
-
     return result;
 }
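
The new helper leans on the GBNF `x{min,max}` repetition bounds instead of expanding repetitions by hand. A rough Python transcription of the logic above (for reference only, not the C++ source itself):

```python
from __future__ import annotations

# Rough transcription of the simplified build_repetition(): bounded repetition
# is emitted directly as {min,max}, and the separator case recurses on the
# "(sep item)" group instead of nesting "(...)?" groups.
def build_repetition(item: str, min_items: int, max_items: int | None, sep: str = "") -> str:
    if min_items == 0 and max_items == 1:
        return item + "?"
    if not sep:
        if min_items == 1 and max_items is None:
            return item + "+"
        if min_items == 0 and max_items is None:
            return item + "*"
        return item + "{" + str(min_items) + "," + ("" if max_items is None else str(max_items)) + "}"
    # with a separator: item (sep item){min-1,max-1}, made optional when min == 0
    result = item + " " + build_repetition(
        "(" + sep + " " + item + ")",
        0 if min_items == 0 else min_items - 1,
        None if max_items is None else max_items - 1,
        "",
    )
    if min_items == 0:
        result = "(" + result + ")?"
    return result

# build_repetition("[0-9]", 1, 16)        -> '[0-9]{1,16}'
# build_repetition("item", 0, 3, '","')   -> '(item ("," item){0,2})?'
```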
@ -78,30 +47,24 @@ struct BuiltinRule {
std::vector<std::string> deps; std::vector<std::string> deps;
}; };
-const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
-
 std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
     {"boolean", {"(\"true\" | \"false\") space", {}}},
-    {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
-    {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
+    {"decimal-part", {"[0-9]{1,16}", {}}},
+    {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
     {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
     {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
     {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
     {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
     {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
-    {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
-    {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
+    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
+    {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
     {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
     {"null", {"\"null\" space", {}}},
 };
 
 std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
-    {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
-    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+    {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
     {"date-time", {"date \"T\" time", {"date", "time"}}},
     {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
     {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
@ -385,8 +348,7 @@ private:
sub_is_literal ? "\"" + sub + "\"" : sub, sub_is_literal ? "\"" + sub + "\"" : sub,
min_times, min_times,
max_times, max_times,
"", ""
sub_is_literal
); );
seq.back().second = false; seq.back().second = false;
} else { } else {

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This script downloads the tokenizer models of the specified models from Huggingface and # This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py # generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
@ -82,6 +83,7 @@ models = [
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
] ]

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations from __future__ import annotations
@ -46,11 +47,12 @@ class Model:
_model_classes: dict[str, type[Model]] = {} _model_classes: dict[str, type[Model]] = {}
dir_model: Path dir_model: Path
ftype: int ftype: gguf.LlamaFileType
is_big_endian: bool is_big_endian: bool
endianess: gguf.GGUFEndian endianess: gguf.GGUFEndian
use_temp_file: bool use_temp_file: bool
lazy: bool lazy: bool
model_name: str | None
part_names: list[str] part_names: list[str]
is_safetensors: bool is_safetensors: bool
hparams: dict[str, Any] hparams: dict[str, Any]
@ -63,7 +65,7 @@ class Model:
# subclasses should define this! # subclasses should define this!
model_arch: gguf.MODEL_ARCH model_arch: gguf.MODEL_ARCH
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool): def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
if type(self) is Model: if type(self) is Model:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
self.dir_model = dir_model self.dir_model = dir_model
@ -72,10 +74,11 @@ class Model:
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.use_temp_file = use_temp_file self.use_temp_file = use_temp_file
self.lazy = not eager self.lazy = not eager
self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors") self.model_name = model_name
self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
self.is_safetensors = len(self.part_names) > 0 self.is_safetensors = len(self.part_names) > 0
if not self.is_safetensors: if not self.is_safetensors:
self.part_names = Model.get_model_part_names(self.dir_model, ".bin") self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
self.hparams = Model.load_hparams(self.dir_model) self.hparams = Model.load_hparams(self.dir_model)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@ -93,7 +96,7 @@ class Model:
ftype_lw: str = ftype_up.lower() ftype_lw: str = ftype_up.lower()
# allow templating the file name with the output ftype, useful with the "auto" ftype # allow templating the file name with the output ftype, useful with the "auto" ftype
self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file) self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
@classmethod @classmethod
def __init_subclass__(cls): def __init_subclass__(cls):
@ -181,7 +184,7 @@ class Model:
return new_name return new_name
def set_gguf_parameters(self): def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_block_count(self.block_count)
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@ -323,21 +326,21 @@ class Model:
def write(self): def write(self):
self.write_tensors() self.write_tensors()
self.gguf_writer.write_header_to_file() self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_kv_data_to_file() self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.write_tensors_to_file(progress=True) self.gguf_writer.write_tensors_to_file(progress=True)
self.gguf_writer.close() self.gguf_writer.close()
def write_vocab(self): def write_vocab(self):
self.gguf_writer.write_header_to_file() self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_kv_data_to_file() self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.close() self.gguf_writer.close()
@staticmethod @staticmethod
def get_model_part_names(dir_model: Path, suffix: str) -> list[str]: def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
part_names: list[str] = [] part_names: list[str] = []
for filename in os.listdir(dir_model): for filename in os.listdir(dir_model):
if filename.endswith(suffix): if filename.startswith(prefix) and filename.endswith(suffix):
part_names.append(filename) part_names.append(filename)
part_names.sort() part_names.sort()
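
A hypothetical standalone version of the stricter shard filter above: with the added prefix check, unrelated files in the model directory (for example `training_args.bin`) are no longer picked up as weight parts.

```python
import os

# Keep only files that look like weight shards, e.g. "model-00001-of-00002.safetensors"
# or "pytorch_model-00001-of-00002.bin", sorted so parts load in order.
def get_model_part_names(dir_model: str, prefix: str, suffix: str) -> list[str]:
    return sorted(
        name for name in os.listdir(dir_model)
        if name.startswith(prefix) and name.endswith(suffix)
    )

# get_model_part_names("my-model", "model", ".safetensors")
# -> ["model-00001-of-00002.safetensors", "model-00002-of-00002.safetensors"]
```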
@ -474,6 +477,9 @@ class Model:
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
res = "smaug-bpe" res = "smaug-bpe"
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
res = "jina-v2-code"
if res is None: if res is None:
logger.warning("\n") logger.warning("\n")
@ -661,7 +667,7 @@ class GPTNeoXModel(Model):
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"] block_count = self.hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_block_count(block_count)
@ -794,7 +800,7 @@ class MPTModel(Model):
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.hparams["n_layers"] block_count = self.hparams["n_layers"]
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
self.gguf_writer.add_embedding_length(self.hparams["d_model"]) self.gguf_writer.add_embedding_length(self.hparams["d_model"])
self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_block_count(block_count)
@ -846,7 +852,7 @@ class OrionModel(Model):
raise ValueError("gguf: can not find ctx length parameter.") raise ValueError("gguf: can not find ctx length parameter.")
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_context_length(ctx_length)
@ -883,7 +889,7 @@ class BaichuanModel(Model):
else: else:
raise ValueError("gguf: can not find ctx length parameter.") raise ValueError("gguf: can not find ctx length parameter.")
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_context_length(ctx_length)
@ -1006,7 +1012,7 @@ class XverseModel(Model):
else: else:
raise ValueError("gguf: can not find ctx length parameter.") raise ValueError("gguf: can not find ctx length parameter.")
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_context_length(ctx_length)
@ -1202,7 +1208,7 @@ class StableLMModel(Model):
hparams = self.hparams hparams = self.hparams
block_count = hparams["num_hidden_layers"] block_count = hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_block_count(block_count)
@ -1717,7 +1723,7 @@ class GPT2Model(Model):
model_arch = gguf.MODEL_ARCH.GPT2 model_arch = gguf.MODEL_ARCH.GPT2
def set_gguf_parameters(self): def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_block_count(self.hparams["n_layer"])
self.gguf_writer.add_context_length(self.hparams["n_ctx"]) self.gguf_writer.add_context_length(self.hparams["n_ctx"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@ -2284,7 +2290,7 @@ class GemmaModel(Model):
hparams = self.hparams hparams = self.hparams
block_count = hparams["num_hidden_layers"] block_count = hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_block_count(block_count)
@ -2384,7 +2390,7 @@ class MambaModel(Model):
# Fail early for models which don't have a block expansion factor of 2 # Fail early for models which don't have a block expansion factor of 2
assert d_inner == 2 * d_model assert d_inner == 2 * d_model
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
self.gguf_writer.add_embedding_length(d_model) self.gguf_writer.add_embedding_length(d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
@ -2491,11 +2497,13 @@ class JinaBertV2Model(BertModel):
def get_tensors(self): def get_tensors(self):
for name, data in super().get_tensors(): for name, data in super().get_tensors():
if 'gated_layers' in name: if 'gated_layer' in name:
d1 = data[:self.intermediate_size, :] d1 = data[:self.intermediate_size, :]
name1 = name.replace('gated_layers', 'gated_layers_w') name1 = name.replace('gated_layers', 'gated_layers_w')
name1 = name1.replace('up_gated_layer', 'gated_layers_v')
d2 = data[self.intermediate_size:, :] d2 = data[self.intermediate_size:, :]
name2 = name.replace('gated_layers', 'gated_layers_v') name2 = name.replace('gated_layers', 'gated_layers_v')
name2 = name2.replace('up_gated_layer', 'gated_layers_w')
yield name1, d1 yield name1, d1
yield name2, d2 yield name2, d2
continue continue
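
A simplified sketch (plain numpy; names hypothetical) of what the loop above does for the basic `gated_layers` case: the fused gate/up projection is split in half along the first dimension and re-emitted under the `_w`/`_v` names. The new `up_gated_layer` variant additionally swaps which half gets which suffix, which is not shown here.

```python
import numpy as np

def split_gated_layers(name: str, data: np.ndarray, intermediate_size: int):
    # First half of the fused tensor becomes the "w" projection, second half the "v" projection.
    d1 = data[:intermediate_size, :]
    d2 = data[intermediate_size:, :]
    yield name.replace("gated_layers", "gated_layers_w"), d1
    yield name.replace("gated_layers", "gated_layers_v"), d2
```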
@ -2886,7 +2894,7 @@ def main() -> None:
logger.error(f"Model {hparams['architectures'][0]} is not supported") logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1) sys.exit(1)
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy) model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
logger.info("Set model parameters") logger.info("Set model parameters")
model_instance.set_gguf_parameters() model_instance.set_gguf_parameters()

View File

@ -15,7 +15,6 @@ else()
add_subdirectory(baby-llama) add_subdirectory(baby-llama)
add_subdirectory(batched) add_subdirectory(batched)
add_subdirectory(batched-bench) add_subdirectory(batched-bench)
add_subdirectory(beam-search)
add_subdirectory(benchmark) add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding) add_subdirectory(embedding)

View File

@ -522,8 +522,8 @@ static struct ggml_tensor * forward(
// wk shape [n_embd, n_embd, 1, 1] // wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, 1] // Qcur shape [n_embd/n_head, n_head, N, 1]
// Kcur shape [n_embd/n_head, n_head, N, 1] // Kcur shape [n_embd/n_head, n_head, N, 1]
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0); struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0); struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
// store key and value to memory // store key and value to memory
{ {
@ -759,8 +759,8 @@ static struct ggml_tensor * forward_batch(
// wk shape [n_embd, n_embd, 1, 1] // wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, n_batch] // Qcur shape [n_embd/n_head, n_head, N, n_batch]
// Kcur shape [n_embd/n_head, n_head, N, n_batch] // Kcur shape [n_embd/n_head, n_head, N, n_batch]
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0); struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0); struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
@ -1056,7 +1056,7 @@ static struct ggml_tensor * forward_lora(
model->layers[il].wqb, model->layers[il].wqb,
cur)), cur)),
n_embd/n_head, n_head, N), n_embd/n_head, n_head, N),
KQ_pos, n_rot, 0, 0); KQ_pos, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, struct ggml_tensor * Kcur = ggml_rope(ctx0,
ggml_reshape_3d(ctx0, ggml_reshape_3d(ctx0,
ggml_mul_mat(ctx0, ggml_mul_mat(ctx0,
@ -1065,7 +1065,7 @@ static struct ggml_tensor * forward_lora(
model->layers[il].wkb, model->layers[il].wkb,
cur)), cur)),
n_embd/n_head, n_head, N), n_embd/n_head, n_head, N),
KQ_pos, n_rot, 0, 0); KQ_pos, n_rot, 0);
// store key and value to memory // store key and value to memory
{ {

View File

@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`) - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
```bash ```bash
-./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
+./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
 
 # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
+./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
 
 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
+./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
 
 # custom set of batches
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
+./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
``` ```
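
For a concrete sense of the shared-prompt mode described above, a small worked example (numbers are mine):

```python
# Shared prompt (-pps): N_KV = PP + B*TG
pp, tg, b = 512, 128, 4
n_kv = pp + b * tg   # 512 + 4*128 = 1024 KV-cache cells needed
```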
## Sample results ## Sample results

View File

@ -28,67 +28,27 @@ static std::vector<int> parse_list(char * p) {
return ret; return ret;
} }
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
LOG_TEE("\n");
}
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (argc == 1 || argv[1][0] == '-') { if (!gpt_params_parse(argc, argv, params)) {
printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]); print_usage(argc, argv, params);
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n"); return 1;
printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
return 1 ;
} }
int n_kv_max = 2048; int is_pp_shared = params.is_pp_shared;
int n_batch = 2048;
int n_ubatch = 512;
bool flash_attn = false;
int is_pp_shared = 0;
int n_gpu_layers = 0;
std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, }; std::vector<int> n_pp = params.n_pp;
std::vector<int> n_tg = { 128, 256, }; std::vector<int> n_tg = params.n_tg;
std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, }; std::vector<int> n_pl = params.n_pl;
//std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
if (argc >= 2) {
params.model = argv[1];
}
if (argc >= 3) {
n_kv_max = std::atoi(argv[2]);
}
if (argc >= 4) {
n_batch = std::atoi(argv[3]);
}
if (argc >= 5) {
n_ubatch = std::atoi(argv[4]);
}
if (argc >= 6) {
flash_attn = std::atoi(argv[5]);
}
if (argc >= 7) {
is_pp_shared = std::atoi(argv[6]);
}
if (argc >= 8) {
n_gpu_layers = std::atoi(argv[7]);
}
if (argc >= 9) {
n_pp = parse_list(argv[8]);
}
if (argc >= 10) {
n_tg = parse_list(argv[9]);
}
if (argc >= 11) {
n_pl = parse_list(argv[10]);
}
// init LLM // init LLM
@ -97,12 +57,7 @@ int main(int argc, char ** argv) {
// initialize the model // initialize the model
llama_model_params model_params = llama_model_default_params(); llama_model_params model_params = llama_model_params_from_gpt_params(params);
const std::vector<float> t_split(llama_max_devices(), 0.0f);
model_params.n_gpu_layers = n_gpu_layers;
model_params.tensor_split = t_split.data();
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@ -111,16 +66,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
llama_context_params ctx_params = llama_context_default_params(); llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = n_batch;
ctx_params.n_ubatch = n_ubatch;
ctx_params.flash_attn = flash_attn;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
// ensure enough sequences are available // ensure enough sequences are available
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end()); ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
@ -132,6 +78,8 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
const int32_t n_kv_max = llama_n_ctx(ctx);
llama_batch batch = llama_batch_init(n_kv_max, 0, 1); llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
// decode in batches of ctx_params.n_batch tokens // decode in batches of ctx_params.n_batch tokens
@ -175,7 +123,7 @@ int main(int argc, char ** argv) {
} }
LOG_TEE("\n"); LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n"); LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");

View File

@ -3,7 +3,7 @@
The example demonstrates batched generation from a given prompt The example demonstrates batched generation from a given prompt
```bash ```bash
-./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
+./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
... ...

View File

@ -7,48 +7,31 @@
#include <string> #include <string>
#include <vector> #include <vector>
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
LOG_TEE("\n");
}
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (argc == 1 || argv[1][0] == '-') { params.prompt = "Hello my name is";
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]); params.n_predict = 32;
return 1 ;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
} }
// number of parallel batches // number of parallel batches
int n_parallel = 1; int n_parallel = params.n_parallel;
// total length of the sequences including the prompt // total length of the sequences including the prompt
int n_len = 32; int n_predict = 32;
// number of layers to offload to the GPU
int n_gpu_layers = 0;
if (argc >= 2) {
params.model = argv[1];
}
if (argc >= 3) {
params.prompt = argv[2];
}
if (argc >= 4) {
n_parallel = std::atoi(argv[3]);
}
if (argc >= 5) {
n_len = std::atoi(argv[4]);
}
if (argc >= 6) {
n_gpu_layers = std::atoi(argv[5]);
}
if (params.prompt.empty()) {
params.prompt = "Hello my name is";
}
string_process_escapes(params.prompt);
// init LLM // init LLM
@ -57,9 +40,7 @@ int main(int argc, char ** argv) {
// initialize the model // initialize the model
llama_model_params model_params = llama_model_default_params(); llama_model_params model_params = llama_model_params_from_gpt_params(params);
model_params.n_gpu_layers = n_gpu_layers;
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@ -73,18 +54,14 @@ int main(int argc, char ** argv) {
std::vector<llama_token> tokens_list; std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(model, params.prompt, true); tokens_list = ::llama_tokenize(model, params.prompt, true);
const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel; const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
// initialize the context // initialize the context
llama_context_params ctx_params = llama_context_default_params(); llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req; ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel); ctx_params.n_batch = std::max(n_predict, n_parallel);
ctx_params.n_seq_max = n_parallel;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_new_context_with_model(model, ctx_params);
@ -93,9 +70,9 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens // make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) { if (n_kv_req > n_ctx) {
@ -156,7 +133,7 @@ int main(int argc, char ** argv) {
const auto t_main_start = ggml_time_us(); const auto t_main_start = ggml_time_us();
while (n_cur <= n_len) { while (n_cur <= n_predict) {
// prepare the next batch // prepare the next batch
llama_batch_clear(batch); llama_batch_clear(batch);
@ -192,7 +169,7 @@ int main(int argc, char ** argv) {
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of generation? -> mark the stream as finished // is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1; i_batch[i] = -1;
LOG_TEE("\n"); LOG_TEE("\n");
if (n_parallel > 1) { if (n_parallel > 1) {

View File

@ -1,5 +0,0 @@
set(TARGET beam-search)
add_executable(${TARGET} beam-search.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@ -1,188 +0,0 @@
#include "common.h"
#include "llama.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
// Used for debugging to print out beam tokens.
struct ostream_beam_view {
llama_context * ctx;
llama_beam_view beam_view;
};
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
}
return os << ')';
}
// Put here anything you want back in beam_search_callback().
struct beam_search_callback_data {
llama_context * ctx;
std::vector<llama_token> response;
};
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
}
// Function matching type llama_beam_search_callback_fn_t.
// Custom callback example is called each time the beams lengths increase:
// * Show progress by printing ',' following by number of convergent beam tokens if any.
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
llama_beam_view& beam_view = beams_state.beam_views[i];
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
beam_view.eob = true;
}
}
printf(","); // Show progress
if (const size_t n = beams_state.common_prefix_length) {
callback_data.response.resize(callback_data.response.size() + n);
assert(0u < beams_state.n_beams);
const llama_token * tokens = beams_state.beam_views[0].tokens;
std::copy(tokens, tokens + n, callback_data.response.end() - n);
printf("%zu", n);
}
fflush(stdout);
#if 1 // DEBUG: print current beams for this iteration
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
}
#endif
}
int main(int argc, char ** argv)
{
gpt_params params;
//params.n_gpu_layers = 200;
//---------------------------------
// Print help :
//---------------------------------
if ( argc < 2 || argv[1][0] == '-' )
{
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
return 1 ;
}
//---------------------------------
// Load parameters :
//---------------------------------
params.model = argv[1];
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
if ( argc > 3 )
{
params.prompt = argv[3];
}
if ( params.prompt.empty() )
{
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
}
//---------------------------------
// Init LLM :
//---------------------------------
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params( params );
if ( model == NULL )
{
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
return 1;
}
//---------------------------------
// Tokenize the prompt :
//---------------------------------
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;
if (tokens_list.size() > max_tokens_list_size)
{
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
__func__ , tokens_list.size() , max_tokens_list_size );
return 1;
}
fprintf( stderr, "\n\n" );
// Print the tokens from the prompt :
for( auto id : tokens_list )
{
std::cout << llama_token_to_piece(ctx, id);
}
std::cout << std::flush;
int n_past = 0;
if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
{
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
return 1;
}
n_past += tokens_list.size();
beam_search_callback_data callback_data{ctx, {}};
size_t const beam_width = static_cast<size_t>(params.n_beams);
int const n_predict = 256;
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
std::cout << "\n\n";
for (llama_token const token_id : callback_data.response) {
std::cout << llama_token_to_piece(ctx,token_id);
}
std::cout << std::endl;
llama_free( ctx );
llama_free_model( model );
llama_backend_free();
return 0;
}

View File

@ -176,7 +176,7 @@ class Params:
rope_scaling_type: gguf.RopeScalingType | None = None rope_scaling_type: gguf.RopeScalingType | None = None
f_rope_freq_base: float | None = None f_rope_freq_base: float | None = None
f_rope_scale: float | None = None f_rope_scale: float | None = None
n_orig_ctx: int | None = None n_ctx_orig: int | None = None
rope_finetuned: bool | None = None rope_finetuned: bool | None = None
ftype: GGMLFileType | None = None ftype: GGMLFileType | None = None
@ -226,7 +226,7 @@ class Params:
with open(config_path) as f: with open(config_path) as f:
config = json.load(f) config = json.load(f)
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None rope_scaling_type = f_rope_scale = n_ctx_orig = rope_finetuned = None
rope_scaling = config.get("rope_scaling") rope_scaling = config.get("rope_scaling")
if rope_scaling is not None and (typ := rope_scaling.get("type")): if rope_scaling is not None and (typ := rope_scaling.get("type")):
@ -236,7 +236,7 @@ class Params:
rope_scaling_type = gguf.RopeScalingType.LINEAR rope_scaling_type = gguf.RopeScalingType.LINEAR
elif typ == "yarn": elif typ == "yarn":
rope_scaling_type = gguf.RopeScalingType.YARN rope_scaling_type = gguf.RopeScalingType.YARN
n_orig_ctx = rope_scaling['original_max_position_embeddings'] n_ctx_orig = rope_scaling['original_max_position_embeddings']
rope_finetuned = rope_scaling['finetuned'] rope_finetuned = rope_scaling['finetuned']
else: else:
raise NotImplementedError(f'Unknown rope scaling type: {typ}') raise NotImplementedError(f'Unknown rope scaling type: {typ}')
@ -272,7 +272,7 @@ class Params:
f_rope_freq_base = config.get("rope_theta"), f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type = rope_scaling_type, rope_scaling_type = rope_scaling_type,
f_rope_scale = f_rope_scale, f_rope_scale = f_rope_scale,
n_orig_ctx = n_orig_ctx, n_ctx_orig = n_ctx_orig,
rope_finetuned = rope_finetuned, rope_finetuned = rope_finetuned,
) )
@ -864,8 +864,8 @@ class OutputFile:
self.gguf.add_rope_scaling_type(params.rope_scaling_type) self.gguf.add_rope_scaling_type(params.rope_scaling_type)
self.gguf.add_rope_scaling_factor(params.f_rope_scale) self.gguf.add_rope_scaling_factor(params.f_rope_scale)
if params.n_orig_ctx is not None: if params.n_ctx_orig is not None:
self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx) self.gguf.add_rope_scaling_orig_ctx_len(params.n_ctx_orig)
if params.rope_finetuned is not None: if params.rope_finetuned is not None:
self.gguf.add_rope_scaling_finetuned(params.rope_finetuned) self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)

View File

@ -63,6 +63,7 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
@ -79,9 +80,6 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = string_random_prompt(rng);
}
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);

View File

@ -140,20 +140,18 @@ static bool run(llama_context * ctx, const gpt_params & params) {
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
callback_data cb_data; callback_data cb_data;
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
print_build_info(); print_build_info();
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = string_random_prompt(rng);
}
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);

View File

@ -564,7 +564,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
const int rope_mode = 0; const int rope_mode = 0;
return ggml_rope_ext(ctx, return ggml_rope_ext(ctx,
t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx,
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
); );
}; };

View File

@ -61,10 +61,10 @@ static size_t split_str_to_n_bytes(std::string str) {
int n; int n;
if (str.back() == 'M') { if (str.back() == 'M') {
sscanf(str.c_str(), "%d", &n); sscanf(str.c_str(), "%d", &n);
n_bytes = (size_t)n * 1024 * 1024; // megabytes n_bytes = (size_t)n * 1000 * 1000; // megabytes
} else if (str.back() == 'G') { } else if (str.back() == 'G') {
sscanf(str.c_str(), "%d", &n); sscanf(str.c_str(), "%d", &n);
n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
} else { } else {
throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back())); throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
} }
@ -284,7 +284,7 @@ struct split_strategy {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i)); struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
total_size += ggml_nbytes(t); total_size += ggml_nbytes(t);
} }
total_size = total_size / 1024 / 1024; // convert to megabytes total_size = total_size / 1000 / 1000; // convert to megabytes
printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
i_split++; i_split++;
} }
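
Both hunks above move from binary (1024-based) to decimal (1000-based) units, so a `2G` split limit now means 2,000,000,000 bytes rather than 2 GiB. A minimal sketch of the resulting size parsing:

```python
def split_str_to_n_bytes(s: str) -> int:
    # Accepts sizes like "300M" or "2G"; units are decimal, matching the change above.
    n = int(s[:-1])
    if s.endswith("M"):
        return n * 1000 * 1000            # megabytes
    if s.endswith("G"):
        return n * 1000 * 1000 * 1000     # gigabytes
    raise ValueError("supported units are M (megabytes) or G (gigabytes)")

# split_str_to_n_bytes("2G") -> 2_000_000_000 (previously 2 * 1024**3)
```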

View File

@ -41,7 +41,7 @@ echo PASS
echo echo
# 2b. Test the sharded model is loading properly # 2b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32 $MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
echo PASS echo PASS
echo echo
@ -51,7 +51,7 @@ echo PASS
echo echo
# 3b. Test the merged model is loading properly # 3b. Test the merged model is loading properly
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32 $MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
echo PASS echo PASS
echo echo
@ -61,7 +61,7 @@ echo PASS
echo echo
# 4b. Test the sharded model is loading properly # 4b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32 $MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
echo PASS echo PASS
echo echo
@ -71,7 +71,7 @@ echo
#echo #echo
# 5b. Test the merged model is loading properly # 5b. Test the merged model is loading properly
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32 #$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
#echo PASS #echo PASS
#echo #echo
@ -81,7 +81,7 @@ echo PASS
echo echo
# 6b. Test the sharded model is loading properly # 6b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32 $MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
echo PASS echo PASS
echo echo

View File

@ -153,7 +153,9 @@ static std::string gritlm_instruction(const std::string & instruction) {
int main(int argc, char * argv[]) { int main(int argc, char * argv[]) {
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }

View File

@ -6,16 +6,19 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
## Usage ## Usage
 ```
-./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
-          [-ofreq num_chunks] [-ow <0 or 1>] [other common params]
+./imatrix \
+    -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
+    [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
+    [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
 ```
 
 Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
 The parameters in square brackets are optional and have the following meaning:
 * `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
 * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
-* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
-* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
+* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
For faster computation, make sure to use GPU offloading via the `-ngl` argument For faster computation, make sure to use GPU offloading via the `-ngl` argument

View File

@ -17,39 +17,37 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
LOG_TEE("\n");
}
struct Stats { struct Stats {
std::vector<float> values; std::vector<float> values;
std::vector<int> counts; std::vector<int> counts;
int ncall = 0; int ncall = 0;
}; };
struct StatParams {
std::string dataset;
std::string ofile = "imatrix.dat";
int n_output_frequency = 10;
int verbosity = 1;
int keep_every = 0;
bool collect_output_weight = false;
};
class IMatrixCollector { class IMatrixCollector {
public: public:
IMatrixCollector() = default; IMatrixCollector() = default;
void set_parameters(StatParams&& params) { m_params = std::move(params); } void set_params(gpt_params params) { m_params = std::move(params); }
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix() const; void save_imatrix(int ncall = -1) const;
bool load_imatrix(const char * file_name, bool add); bool load_imatrix(const char * file_name);
static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
private: private:
std::unordered_map<std::string, Stats> m_stats; std::unordered_map<std::string, Stats> m_stats;
StatParams m_params; gpt_params m_params;
std::mutex m_mutex; std::mutex m_mutex;
int m_last_call = 0; int m_last_call = 0;
std::vector<float> m_src1_data; std::vector<float> m_src1_data;
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
//
void save_imatrix(const char * file_name, const char * dataset) const;
void keep_imatrix(int ncall) const;
}; };
// remove any prefix and suffixes from the name // remove any prefix and suffixes from the name
@ -85,7 +83,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (t->op != GGML_OP_MUL_MAT) return false; if (t->op != GGML_OP_MUL_MAT) return false;
// why are small batches ignored (<16 tokens)? // why are small batches ignored (<16 tokens)?
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false; if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
return true; return true;
} }
@ -153,21 +151,25 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) { for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j]; e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++; e.counts[e_start + j]++;
if (!std::isfinite(e.values[e_start + j])) {
fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
exit(1);
}
} }
} }
} }
if (e.ncall > m_last_call) { if (e.ncall > m_last_call) {
m_last_call = e.ncall; m_last_call = e.ncall;
if (m_last_call % m_params.n_output_frequency == 0) { if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix(); save_imatrix();
} }
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
keep_imatrix(m_last_call); save_imatrix(m_last_call);
} }
} }
} }
} else { } else {
auto& e = m_stats[wname]; auto & e = m_stats[wname];
if (e.values.empty()) { if (e.values.empty()) {
e.values.resize(src1->ne[0], 0); e.values.resize(src1->ne[0], 0);
e.counts.resize(src1->ne[0], 0); e.counts.resize(src1->ne[0], 0);
@ -185,15 +187,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) { for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j]; e.values[j] += x[j]*x[j];
e.counts[j]++; e.counts[j]++;
if (!std::isfinite(e.values[j])) {
fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
exit(1);
}
} }
} }
if (e.ncall > m_last_call) { if (e.ncall > m_last_call) {
m_last_call = e.ncall; m_last_call = e.ncall;
if (m_last_call % m_params.n_output_frequency == 0) { if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix(); save_imatrix();
} }
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
keep_imatrix(m_last_call); save_imatrix(m_last_call);
} }
} }
} }
@ -201,33 +207,75 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
return true; return true;
} }
void IMatrixCollector::save_imatrix() const { void IMatrixCollector::save_imatrix(int ncall) const {
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str()); auto fname = m_params.out_file;
} if (fname.empty()) {
fname = "imatrix.dat";
}
void IMatrixCollector::keep_imatrix(int ncall) const { if (ncall > 0) {
auto file_name = m_params.ofile; fname += ".at_";
if (file_name.empty()) file_name = "imatrix.dat"; fname += std::to_string(ncall);
file_name += ".at_"; }
file_name += std::to_string(ncall);
save_imatrix(file_name.c_str(), m_params.dataset.c_str()); // avoid writing imatrix entries that do not have full data
} // this can happen with MoE models where some of the experts end up not being exercised by the provided training data
int n_entries = 0;
std::vector<std::string> to_store;
bool is_first = true; // for printing
for (const auto & kv : m_stats) {
const int n_all = kv.second.counts.size();
if (n_all == 0) {
continue;
}
int n_zeros = 0;
for (const int c : kv.second.counts) {
if (c == 0) {
n_zeros++;
}
}
if (n_zeros != 0 && is_first) {
fprintf(stderr, "\n");
is_first = false;
}
if (n_zeros == n_all) {
fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
continue;
}
if (n_zeros > 0) {
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
continue;
}
n_entries++;
to_store.push_back(kv.first);
}
if (to_store.size() < m_stats.size()) {
fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
}
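
For reference, a rough Python rendering of the new "skip incomplete entries" rule above (helper name is mine): only entries whose counts are fully populated are written, while entries with no data or only partial data (for example experts that the calibration text never routed to) are skipped.

```python
def entries_to_store(counts_by_name: dict[str, list[int]]) -> list[str]:
    keep = []
    for name, counts in counts_by_name.items():
        if not counts:
            continue                       # nothing collected at all
        n_zeros = sum(1 for c in counts if c == 0)
        if n_zeros == 0:
            keep.append(name)              # full data -> store
        # n_zeros == len(counts): no data; 0 < n_zeros < len(counts): partial data -> skip
    return keep
```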
void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
std::ofstream out(fname, std::ios::binary); std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size();
out.write((const char *) &n_entries, sizeof(n_entries)); out.write((const char *) &n_entries, sizeof(n_entries));
for (const auto & p : m_stats) { for (const auto & name : to_store) {
int len = p.first.size(); const auto & stat = m_stats.at(name);
int len = name.size();
out.write((const char *) &len, sizeof(len)); out.write((const char *) &len, sizeof(len));
out.write(p.first.c_str(), len); out.write(name.c_str(), len);
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall)); out.write((const char *) &stat.ncall, sizeof(stat.ncall));
int nval = p.second.values.size(); int nval = stat.values.size();
out.write((const char *) &nval, sizeof(nval)); out.write((const char *) &nval, sizeof(nval));
if (nval > 0) { if (nval > 0) {
std::vector<float> tmp(nval); std::vector<float> tmp(nval);
for (int i = 0; i < nval; i++) { for (int i = 0; i < nval; i++) {
tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall); tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
} }
out.write((const char*)tmp.data(), nval*sizeof(float)); out.write((const char*)tmp.data(), nval*sizeof(float));
} }
@ -236,26 +284,28 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
// Write the number of call the matrix was computed with // Write the number of call the matrix was computed with
out.write((const char *) &m_last_call, sizeof(m_last_call)); out.write((const char *) &m_last_call, sizeof(m_last_call));
// Write the dataset name at the end of the file to later on specify it in quantize
int n_dataset = strlen(dataset);
out.write((const char *) &n_dataset, sizeof(n_dataset));
out.write(dataset, n_dataset);

// Write the input filename at the end of the file to later on specify it in quantize
{
    int len = m_params.prompt_file.size();
    out.write((const char *) &len, sizeof(len));
    out.write(m_params.prompt_file.c_str(), len);
}
if (m_params.verbosity > 0) { if (m_params.verbosity > 0) {
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname); fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
} }
} }
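For readers skimming the hunk above, here is a minimal, self-contained sketch of the completeness check that decides whether an entry is written out; the `Stats` struct and the `entry_is_complete` helper are illustrative names chosen for this sketch, not part of the patch.

```cpp
// illustrative sketch only: mirrors the "skip partial entries" logic above
#include <cstdio>
#include <string>
#include <vector>

struct Stats {                 // stand-in for the collector's per-tensor entry
    std::vector<float> values; // accumulated squared activations per column
    std::vector<int>   counts; // number of accumulated samples per column
};

// an entry is stored only if every column received at least one sample
static bool entry_is_complete(const std::string & name, const Stats & e) {
    const int n_all = (int) e.counts.size();
    if (n_all == 0) {
        return false;
    }
    int n_zeros = 0;
    for (const int c : e.counts) {
        if (c == 0) {
            n_zeros++;
        }
    }
    if (n_zeros == n_all) {
        fprintf(stderr, "entry '%s' has no data - skipping\n", name.c_str());
        return false;
    }
    if (n_zeros > 0) {
        fprintf(stderr, "entry '%s' has partial data (%.2f%%) - skipping\n",
                name.c_str(), 100.0f * (n_all - n_zeros) / n_all);
        return false;
    }
    return true;
}

int main() {
    Stats full    = { {1.0f, 2.0f}, {4, 4} };
    Stats partial = { {1.0f, 0.0f}, {4, 0} };
    printf("full: %d, partial: %d\n", entry_is_complete("blk.0.ffn_up", full), entry_is_complete("blk.1.ffn_up", partial));
    return 0;
}
```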
bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) { bool IMatrixCollector::load_imatrix(const char * fname) {
std::ifstream in(imatrix_file, std::ios::binary); std::ifstream in(fname, std::ios::binary);
if (!in) { if (!in) {
printf("%s: failed to open %s\n",__func__,imatrix_file); printf("%s: failed to open %s\n",__func__, fname);
return false; return false;
} }
int n_entries; int n_entries;
in.read((char*)&n_entries, sizeof(n_entries)); in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) { if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, imatrix_file); printf("%s: no data in file %s\n", __func__, fname);
return false; return false;
} }
for (int i = 0; i < n_entries; ++i) { for (int i = 0; i < n_entries; ++i) {
@ -263,23 +313,22 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
std::vector<char> name_as_vec(len+1); std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len); in.read((char *)name_as_vec.data(), len);
if (in.fail()) { if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file); printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
return false; return false;
} }
name_as_vec[len] = 0; name_as_vec[len] = 0;
std::string name{name_as_vec.data()}; std::string name{name_as_vec.data()};
auto& e = imatrix_data[std::move(name)]; auto & e = m_stats[std::move(name)];
int ncall; int ncall;
in.read((char*)&ncall, sizeof(ncall)); in.read((char*)&ncall, sizeof(ncall));
int nval; int nval;
in.read((char *)&nval, sizeof(nval)); in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) { if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n",__func__,i); printf("%s: failed reading number of values for entry %d\n",__func__,i);
imatrix_data = {}; m_stats = {};
return false; return false;
} }
// When re-called from load_imatrix() with add set, this will already be created.
if (e.values.empty()) { if (e.values.empty()) {
e.values.resize(nval, 0); e.values.resize(nval, 0);
e.counts.resize(nval, 0); e.counts.resize(nval, 0);
@ -289,7 +338,7 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
in.read((char*)tmp.data(), nval*sizeof(float)); in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) { if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i); printf("%s: failed reading data for entry %d\n",__func__,i);
imatrix_data = {}; m_stats = {};
return false; return false;
} }
@ -304,13 +353,6 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
return true; return true;
} }
bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
if (!add) {
m_stats.clear();
}
return load_imatrix(file_name, m_stats);
}
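To make the file layout handled by save_imatrix() and load_imatrix() above easier to follow, here is a hedged sketch of a standalone dumper. It assumes the sizes are written as native 32-bit ints and the values as 32-bit floats, as in the code above; the tool itself and its output formatting are illustrative, not part of the patch.

```cpp
// illustrative reader for the imatrix layout written above:
//   int n_entries
//   per entry: int name_len, name bytes, int ncall, int nval, nval * float
//   trailer:   int last_call (chunk count), int len, input/prompt file name bytes
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s imatrix.dat\n", argv[0]);
        return 1;
    }
    std::ifstream in(argv[1], std::ios::binary);
    if (!in) {
        fprintf(stderr, "failed to open %s\n", argv[1]);
        return 1;
    }
    int n_entries = 0;
    in.read((char *) &n_entries, sizeof(n_entries));
    for (int i = 0; i < n_entries && in; ++i) {
        int len = 0;
        in.read((char *) &len, sizeof(len));
        std::string name(len, '\0');
        in.read(&name[0], len);
        int ncall = 0;
        int nval  = 0;
        in.read((char *) &ncall, sizeof(ncall));
        in.read((char *) &nval,  sizeof(nval));
        std::vector<float> vals(nval > 0 ? nval : 0);
        in.read((char *) vals.data(), nval * sizeof(float));
        printf("%-40s ncall = %d, nval = %d\n", name.c_str(), ncall, nval);
    }
    int last_call = 0;
    in.read((char *) &last_call, sizeof(last_call));
    int len = 0;
    in.read((char *) &len, sizeof(len));
    std::string dataset(len, '\0');
    in.read(&dataset[0], len);
    printf("chunks = %d, dataset = '%s'\n", last_call, dataset.c_str());
    return 0;
}
```

This is only a reading aid; the canonical loader remains IMatrixCollector::load_imatrix() shown above.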
static IMatrixCollector g_collector; static IMatrixCollector g_collector;
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@ -324,7 +366,7 @@ struct results_log_softmax {
float prob; float prob;
}; };
static std::vector<float> softmax(const std::vector<float>& logits) { static std::vector<float> softmax(const std::vector<float> & logits) {
std::vector<float> probs(logits.size()); std::vector<float> probs(logits.size());
float max_logit = logits[0]; float max_logit = logits[0];
for (float v : logits) { for (float v : logits) {
@ -358,8 +400,7 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
static void process_logits( static void process_logits(
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers, int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history double & nll, double & nll2, float * logit_history, float * prob_history) {
) {
std::mutex mutex; std::mutex mutex;
int counter = 0; int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@ -391,8 +432,7 @@ static void process_logits(
} }
} }
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) { static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
@ -405,13 +445,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
auto tim2 = std::chrono::high_resolution_clock::now(); auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count()); fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (from_chunk > 0) { if (params.i_chunk > 0) {
if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) { if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk); fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
return false; return false;
} }
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx); fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx); tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
} }
if (int(tokens.size()) < 2*n_ctx) { if (int(tokens.size()) < 2*n_ctx) {
@ -424,7 +464,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
std::vector<float> logit_history; std::vector<float> logit_history;
std::vector<float> prob_history; std::vector<float> prob_history;
if (compute_ppl) { if (params.compute_ppl) {
logit_history.resize(tokens.size()); logit_history.resize(tokens.size());
prob_history.resize(tokens.size()); prob_history.resize(tokens.size());
} }
@ -446,7 +486,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
const int num_batches = (n_ctx + n_batch - 1) / n_batch; const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits; std::vector<float> logits;
if (compute_ppl && num_batches > 1) { if (params.compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab); logits.reserve((size_t)n_ctx * n_vocab);
} }
@ -482,7 +522,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
// restore the original token in case it was set to BOS // restore the original token in case it was set to BOS
tokens[batch_start] = token_org; tokens[batch_start] = token_org;
if (compute_ppl && num_batches > 1) { if (params.compute_ppl && num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx); const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
} }
@ -501,7 +541,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
} }
if (compute_ppl) { if (params.compute_ppl) {
const int first = n_ctx/2; const int first = n_ctx/2;
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
@ -516,7 +556,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
} }
printf("\n"); printf("\n");
if (compute_ppl) { if (params.compute_ppl) {
nll2 /= count; nll2 /= count;
nll /= count; nll /= count;
const double ppl = exp(nll); const double ppl = exp(nll);
@ -533,111 +573,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
StatParams sparams;
std::string prev_result_file;
std::string combine_files;
bool compute_ppl = true;
int from_chunk = 0;
std::vector<char*> args;
args.push_back(argv[0]);
int iarg = 1;
for (; iarg < argc-1; ++iarg) {
std::string arg{argv[iarg]};
if (arg == "-o" || arg == "--output-file") {
sparams.ofile = argv[++iarg];
}
else if (arg == "-ofreq" || arg == "--output-frequency") {
sparams.n_output_frequency = std::stoi(argv[++iarg]);
}
else if (arg == "-ow" || arg == "--output-weight") {
sparams.collect_output_weight = std::stoi(argv[++iarg]);
}
else if (arg == "--verbosity") {
sparams.verbosity = std::stoi(argv[++iarg]);
} else if (arg == "--no-ppl") {
compute_ppl = false;
} else if (arg == "--keep-imatrix") {
sparams.keep_every = std::stoi(argv[++iarg]);
} else if (arg == "--continue-from") {
prev_result_file = argv[++iarg];
} else if (arg == "--combine") {
combine_files = argv[++iarg];
}
else if (arg == "--from-chunk") {
from_chunk = std::stoi(argv[++iarg]);
} else {
args.push_back(argv[iarg]);
}
}
if (iarg < argc) {
std::string arg{argv[iarg]};
if (arg == "--no-ppl") {
compute_ppl = false;
} else {
args.push_back(argv[iarg]);
}
}
    gpt_params params;
    params.n_batch = 512;
    if (!gpt_params_parse(args.size(), args.data(), params)) {
        return 1;
    }

    params.logits_all = true;
    params.n_batch = std::min(params.n_batch, params.n_ctx);

    print_build_info();

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }

    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);

    std::mt19937 rng(params.seed);
    if (params.random_prompt) {
        params.prompt = string_random_prompt(rng);
    }

    sparams.dataset = params.prompt_file;
    g_collector.set_parameters(std::move(sparams));

    if (!combine_files.empty()) {
        std::vector<std::string> files;
        size_t pos = 0;
        while (true) {
            auto new_pos = combine_files.find(',', pos);
            if (new_pos != std::string::npos) {
                files.emplace_back(combine_files.substr(pos, new_pos - pos));
                pos = new_pos + 1;
            } else {
                files.emplace_back(combine_files.substr(pos));
                break;
            }
        }
        if (files.size() < 2) {
            fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
            return 1;
        }
        printf("Combining the following %d files\n", int(files.size()));
        for (auto& file : files) {
            printf(" %s\n", file.c_str());
            if (!g_collector.load_imatrix(file.c_str(), true)) {
                fprintf(stderr, "Failed to load %s\n", file.c_str());
                return 1;
            }
        }
        g_collector.save_imatrix();
        return 0;
    }

    if (!prev_result_file.empty()) {
        if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
            fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
            return 1;
        }
    }

    gpt_params params;

    params.n_ctx = 512;
    params.logits_all = true;
    params.verbosity = 1;

    if (!gpt_params_parse(argc, argv, params)) {
        print_usage(argc, argv, params);
        return 1;
    }

    params.n_batch = std::min(params.n_batch, params.n_ctx);

    g_collector.set_params(params);

    for (const auto & in_file : params.in_files) {
        printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
        if (!g_collector.load_imatrix(in_file.c_str())) {
            fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
            return 1;
        }
    }

    if (params.in_files.size() > 1) {
        printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
        g_collector.save_imatrix();
    }
llama_backend_init(); llama_backend_init();
@ -652,6 +613,7 @@ int main(int argc, char ** argv) {
// init // init
llama_model * model; llama_model * model;
llama_context * ctx; llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params); std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) { if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__); fprintf(stderr, "%s : failed to init\n", __func__);
@ -670,8 +632,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk); if (!compute_imatrix(ctx, params)) {
if (!OK) {
return 1; return 1;
} }

View File

@ -107,6 +107,7 @@ int main(int argc, char ** argv) {
g_params = &params; g_params = &params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
@ -139,27 +140,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8; params.n_ctx = 8;
} }
if (params.instruct) {
printf("\n************\n");
printf("%s: please use the 'main' tool for instruct mode\n", __func__);
printf("************\n\n");
return 0;
}
if (params.chatml) {
printf("\n************\n");
printf("%s: please use the 'main' tool for chatml mode\n", __func__);
printf("************\n\n");
return 0;
}
if (!params.antiprompt.empty()) {
printf("\n************\n");
printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
printf("************\n\n");
return 0;
}
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) { if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
printf("\n************\n"); printf("\n************\n");
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
@ -167,20 +147,6 @@ int main(int argc, char ** argv) {
return 0; return 0;
} }
if (params.random_prompt) {
printf("\n************\n");
printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
printf("************\n\n");
return 0;
}
if (!params.path_prompt_cache.empty()) {
printf("\n************\n");
printf("%s: infill does not support prompt caching\n", __func__);
printf("************\n\n");
return 0;
}
if (params.rope_freq_base != 0.0) { if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
@ -207,17 +173,13 @@ int main(int argc, char ** argv) {
llama_model * model; llama_model * model;
llama_context * ctx; llama_context * ctx;
llama_context * ctx_guidance = NULL;
g_model = &model; g_model = &model;
g_ctx = &ctx; g_ctx = &ctx;
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__); LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params(params); std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (sparams.cfg_scale > 1.f) {
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
ctx_guidance = llama_new_context_with_model(model, lparams);
}
if (model == NULL) { if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__); LOG_TEE("%s: error: unable to load model\n", __func__);
@ -273,25 +235,6 @@ int main(int argc, char ** argv) {
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
} }
// Tokenize negative prompt
std::vector<llama_token> guidance_inp;
int guidance_offset = 0;
int original_prompt_len = 0;
if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
original_prompt_len = original_inp.size();
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
LOG("guidance_offset: %s", log_tostr(guidance_offset));
}
if ((int) embd_inp.size() > n_ctx - 4) { if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1; return 1;
@ -319,15 +262,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
} }
if (ctx_guidance) {
LOG_TEE("\n");
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
}
}
if (params.n_keep > 0) { if (params.n_keep > 0) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__); LOG_TEE("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) { for (int i = 0; i < params.n_keep; i++) {
@ -395,12 +329,11 @@ int main(int argc, char ** argv) {
is_interacting = params.interactive_first; is_interacting = params.interactive_first;
} }
bool input_echo = true; bool input_echo = true;
int n_past = 0; int n_past = 0;
int n_remain = params.n_predict; int n_remain = params.n_predict;
int n_consumed = 0; int n_consumed = 0;
int n_past_guidance = 0;
std::vector<int> input_tokens; g_input_tokens = &input_tokens; std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens; std::vector<int> output_tokens; g_output_tokens = &output_tokens;
@ -410,7 +343,6 @@ int main(int argc, char ** argv) {
console::set_display(console::prompt); console::set_display(console::prompt);
std::vector<llama_token> embd; std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
@ -436,7 +368,7 @@ int main(int argc, char ** argv) {
// if we run out of context: // if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past) // - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) { if (n_past + (int) embd.size() > n_ctx) {
if (params.n_predict == -2) { if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break; break;
@ -453,11 +385,7 @@ int main(int argc, char ** argv) {
n_past -= n_discard; n_past -= n_discard;
if (ctx_guidance) { LOG("after swap: n_past = %d\n", n_past);
n_past_guidance -= n_discard;
}
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
@ -465,45 +393,6 @@ int main(int argc, char ** argv) {
// evaluate tokens in batches // evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always // embd is typically prepared beforehand to fit within a batch, but not always
if (ctx_guidance) {
int input_size = 0;
llama_token * input_buf = NULL;
if (n_past_guidance < (int) guidance_inp.size()) {
// Guidance context should have the same data with these modifications:
//
// * Replace the initial prompt
// * Shift everything by guidance_offset
embd_guidance = guidance_inp;
if (embd.begin() + original_prompt_len < embd.end()) {
embd_guidance.insert(
embd_guidance.end(),
embd.begin() + original_prompt_len,
embd.end()
);
}
input_buf = embd_guidance.data();
input_size = embd_guidance.size();
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
} else {
input_buf = embd.data();
input_size = embd.size();
}
for (int i = 0; i < input_size; i += params.n_batch) {
int n_eval = std::min(input_size - i, params.n_batch);
if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
return 1;
}
n_past_guidance += n_eval;
}
}
for (int i = 0; i < (int) embd.size(); i += params.n_batch) { for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i; int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) { if (n_eval > params.n_batch) {
@ -525,11 +414,9 @@ int main(int argc, char ** argv) {
} }
embd.clear(); embd.clear();
embd_guidance.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) { if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
llama_sampling_accept(ctx_sampling, ctx, id, true); llama_sampling_accept(ctx_sampling, ctx, id, true);
@ -583,7 +470,6 @@ int main(int argc, char ** argv) {
// if not currently processing queued inputs; // if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) { if ((int) embd_inp.size() <= n_consumed) {
// deal with eot token in infill mode // deal with eot token in infill mode
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){ if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) { if (is_interacting && !params.interactive_first) {
@ -644,7 +530,6 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
embd_inp.push_back(llama_token_middle(model)); embd_inp.push_back(llama_token_middle(model));
embd.clear(); embd.clear();
embd_guidance.clear();
n_remain = params.n_predict; n_remain = params.n_predict;
n_past = 0; n_past = 0;
n_consumed = 0; n_consumed = 0;
@ -751,7 +636,6 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx); llama_print_timings(ctx);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
if (ctx_guidance) { llama_free(ctx_guidance); }
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);

View File

@ -6,52 +6,22 @@ import re
import sys import sys
from typing import Any, Dict, List, Set, Tuple, Union from typing import Any, Dict, List, Set, Tuple, Union
def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
    if not separator_rule:
        if min_items == 0 and max_items == 1:
            return f'{item_rule}?'
        elif min_items == 1 and max_items is None:
            return f'{item_rule}+'

    result = ''

    if min_items > 0:
        if item_rule_is_literal and separator_rule is None:
            result = '"' + (item_rule[1:-1] * min_items) + '"'
        else:
            result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)

    def opt_repetitions(up_to_n, prefix_with_sep=False):
        '''
            - n=4, no sep:             '(a (a (a (a)?)?)?)?'
            - n=4, sep=',', prefix:    '("," a ("," a ("," a ("," a)?)?)?)?'
            - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
        '''
        content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
        if up_to_n == 0:
            return ''
        elif up_to_n == 1:
            return f'({content})?'
        elif separator_rule and not prefix_with_sep:
            return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
        else:
            return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)

    if min_items > 0 and max_items != min_items:
        result += ' '

    if max_items is not None:
        result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
    else:
        item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'

        if min_items == 0 and separator_rule:
            result = f'({item_rule} {item_operator}*)?'
        else:
            result += f'{item_operator}*'

    return result

def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
    if min_items == 0 and max_items == 1:
        return f'{item_rule}?'

    if not separator_rule:
        if min_items == 1 and max_items is None:
            return f'{item_rule}+'
        elif min_items == 0 and max_items is None:
            return f'{item_rule}*'
        else:
            return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}'

    result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
    return f'({result})?' if min_items == 0 else result
class BuiltinRule: class BuiltinRule:
@ -59,31 +29,29 @@ class BuiltinRule:
self.content = content self.content = content
self.deps = deps or [] self.deps = deps or []
_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
# whitespace is constrained to a single space char to prevent model "running away" in # whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality? # whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?' SPACE_RULE = '" "?'
PRIMITIVE_RULES = { PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []), 'boolean' : BuiltinRule('("true" | "false") space', []),
'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []), 'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []), 'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']), 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), 'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []), 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []), 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []),
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']), 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null' : BuiltinRule('"null" space', []), 'null' : BuiltinRule('"null" space', []),
} }
# TODO: support "uri", "email" string formats # TODO: support "uri", "email" string formats
STRING_FORMAT_RULES = { STRING_FORMAT_RULES = {
'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), 'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : BuiltinRule('date "T" time', ['date', 'time']), 'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']), 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']), 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
@ -333,7 +301,7 @@ class SchemaConverter:
sub_rule_ids[sub] = id sub_rule_ids[sub] = id
sub = id sub = id
seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False) seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False)
else: else:
literal = '' literal = ''
while i < length: while i < length:

View File

@ -162,7 +162,7 @@ $ ./llama-bench -o csv
``` ```
```csv ```csv
build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961" "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342" "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
``` ```
@ -179,7 +179,6 @@ $ ./llama-bench -o json
"build_commit": "3469684", "build_commit": "3469684",
"build_number": 1275, "build_number": 1275,
"cuda": true, "cuda": true,
"opencl": false,
"metal": false, "metal": false,
"gpu_blas": true, "gpu_blas": true,
"blas": true, "blas": true,
@ -210,7 +209,6 @@ $ ./llama-bench -o json
"build_commit": "3469684", "build_commit": "3469684",
"build_number": 1275, "build_number": 1275,
"cuda": true, "cuda": true,
"opencl": false,
"metal": false, "metal": false,
"gpu_blas": true, "gpu_blas": true,
"blas": true, "blas": true,
@ -253,7 +251,6 @@ CREATE TABLE IF NOT EXISTS test (
build_commit TEXT, build_commit TEXT,
build_number INTEGER, build_number INTEGER,
cuda INTEGER, cuda INTEGER,
opencl INTEGER,
metal INTEGER, metal INTEGER,
gpu_blas INTEGER, gpu_blas INTEGER,
blas INTEGER, blas INTEGER,
@ -279,6 +276,6 @@ CREATE TABLE IF NOT EXISTS test (
stddev_ts REAL stddev_ts REAL
); );
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634'); INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692'); INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
``` ```

View File

@ -41,20 +41,6 @@ static std::string join(const std::vector<T> & values, const std::string & delim
return str.str(); return str.str();
} }
template<class T>
static std::vector<T> split(const std::string & str, char delim) {
std::vector<T> values;
std::istringstream str_stream(str);
std::string token;
while (std::getline(str_stream, token, delim)) {
T value;
std::istringstream token_stream(token);
token_stream >> value;
values.push_back(value);
}
return values;
}
template<typename T, typename F> template<typename T, typename F>
static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) { static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
std::vector<std::string> str_values; std::vector<std::string> str_values;
@ -140,10 +126,11 @@ static std::string get_gpu_info() {
} }
// command line params // command line params
enum output_formats {CSV, JSON, MARKDOWN, SQL}; enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
static const char * output_format_str(output_formats format) { static const char * output_format_str(output_formats format) {
switch (format) { switch (format) {
case NONE: return "none";
case CSV: return "csv"; case CSV: return "csv";
case JSON: return "json"; case JSON: return "json";
case MARKDOWN: return "md"; case MARKDOWN: return "md";
@ -152,6 +139,23 @@ static const char * output_format_str(output_formats format) {
} }
} }
static bool output_format_from_str(const std::string & s, output_formats & format) {
if (s == "none") {
format = NONE;
} else if (s == "csv") {
format = CSV;
} else if (s == "json") {
format = JSON;
} else if (s == "md") {
format = MARKDOWN;
} else if (s == "sql") {
format = SQL;
} else {
return false;
}
return true;
}
static const char * split_mode_str(llama_split_mode mode) { static const char * split_mode_str(llama_split_mode mode) {
switch (mode) { switch (mode) {
case LLAMA_SPLIT_MODE_NONE: return "none"; case LLAMA_SPLIT_MODE_NONE: return "none";
@ -190,31 +194,33 @@ struct cmd_params {
int reps; int reps;
bool verbose; bool verbose;
output_formats output_format; output_formats output_format;
output_formats output_format_stderr;
}; };
static const cmd_params cmd_params_defaults = { static const cmd_params cmd_params_defaults = {
/* model */ {"models/7B/ggml-model-q4_0.gguf"}, /* model */ {"models/7B/ggml-model-q4_0.gguf"},
/* n_prompt */ {512}, /* n_prompt */ {512},
/* n_gen */ {128}, /* n_gen */ {128},
/* n_pg */ {}, /* n_pg */ {},
/* n_batch */ {2048}, /* n_batch */ {2048},
/* n_ubatch */ {512}, /* n_ubatch */ {512},
/* type_k */ {GGML_TYPE_F16}, /* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16},
/* n_threads */ {cpu_get_num_math()}, /* n_threads */ {cpu_get_num_math()},
/* n_gpu_layers */ {99}, /* n_gpu_layers */ {99},
/* rpc_servers */ {""}, /* rpc_servers */ {""},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0}, /* main_gpu */ {0},
/* no_kv_offload */ {false}, /* no_kv_offload */ {false},
/* flash_attn */ {false}, /* flash_attn */ {false},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)}, /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true}, /* use_mmap */ {true},
/* embeddings */ {false}, /* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED, /* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5, /* reps */ 5,
/* verbose */ false, /* verbose */ false,
/* output_format */ MARKDOWN /* output_format */ MARKDOWN,
/* output_format_stderr */ NONE,
}; };
static void print_usage(int /* argc */, char ** argv) { static void print_usage(int /* argc */, char ** argv) {
@ -243,6 +249,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n"); printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps); printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf("\n"); printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
@ -284,6 +291,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.verbose = cmd_params_defaults.verbose; params.verbose = cmd_params_defaults.verbose;
params.output_format = cmd_params_defaults.output_format; params.output_format = cmd_params_defaults.output_format;
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
params.reps = cmd_params_defaults.reps; params.reps = cmd_params_defaults.reps;
for (int i = 1; i < argc; i++) { for (int i = 1; i < argc; i++) {
@ -300,28 +308,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<std::string>(argv[i], split_delim); auto p = string_split<std::string>(argv[i], split_delim);
params.model.insert(params.model.end(), p.begin(), p.end()); params.model.insert(params.model.end(), p.begin(), p.end());
} else if (arg == "-p" || arg == "--n-prompt") { } else if (arg == "-p" || arg == "--n-prompt") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<int>(argv[i], split_delim); auto p = string_split<int>(argv[i], split_delim);
params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end()); params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
} else if (arg == "-n" || arg == "--n-gen") { } else if (arg == "-n" || arg == "--n-gen") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<int>(argv[i], split_delim); auto p = string_split<int>(argv[i], split_delim);
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end()); params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
} else if (arg == "-pg") { } else if (arg == "-pg") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<std::string>(argv[i], ','); auto p = string_split<std::string>(argv[i], ',');
if (p.size() != 2) { if (p.size() != 2) {
invalid_param = true; invalid_param = true;
break; break;
@ -332,21 +340,21 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<int>(argv[i], split_delim); auto p = string_split<int>(argv[i], split_delim);
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
} else if (arg == "-ub" || arg == "--ubatch-size") { } else if (arg == "-ub" || arg == "--ubatch-size") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<int>(argv[i], split_delim); auto p = string_split<int>(argv[i], split_delim);
params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end()); params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
} else if (arg == "-ctk" || arg == "--cache-type-k") { } else if (arg == "-ctk" || arg == "--cache-type-k") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<std::string>(argv[i], split_delim); auto p = string_split<std::string>(argv[i], split_delim);
std::vector<ggml_type> types; std::vector<ggml_type> types;
for (const auto & t : p) { for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t); ggml_type gt = ggml_type_from_name(t);
@ -362,7 +370,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<std::string>(argv[i], split_delim); auto p = string_split<std::string>(argv[i], split_delim);
std::vector<ggml_type> types; std::vector<ggml_type> types;
for (const auto & t : p) { for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t); ggml_type gt = ggml_type_from_name(t);
@ -378,14 +386,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<int>(argv[i], split_delim); auto p = string_split<int>(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
} else if (arg == "-ngl" || arg == "--n-gpu-layers") { } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<int>(argv[i], split_delim); auto p = string_split<int>(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
} else if (arg == "-rpc" || arg == "--rpc") { } else if (arg == "-rpc" || arg == "--rpc") {
if (++i >= argc) { if (++i >= argc) {
@ -398,7 +406,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<std::string>(argv[i], split_delim); auto p = string_split<std::string>(argv[i], split_delim);
std::vector<llama_split_mode> modes; std::vector<llama_split_mode> modes;
for (const auto & m : p) { for (const auto & m : p) {
llama_split_mode mode; llama_split_mode mode;
@ -420,13 +428,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true; invalid_param = true;
break; break;
} }
params.main_gpu = split<int>(argv[i], split_delim); params.main_gpu = string_split<int>(argv[i], split_delim);
} else if (arg == "-nkvo" || arg == "--no-kv-offload") { } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<bool>(argv[i], split_delim); auto p = string_split<bool>(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
} else if (arg == "--numa") { } else if (arg == "--numa") {
if (++i >= argc) { if (++i >= argc) {
@ -444,28 +452,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<bool>(argv[i], split_delim); auto p = string_split<bool>(argv[i], split_delim);
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end()); params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
} else if (arg == "-mmp" || arg == "--mmap") { } else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<bool>(argv[i], split_delim); auto p = string_split<bool>(argv[i], split_delim);
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
} else if (arg == "-embd" || arg == "--embeddings") { } else if (arg == "-embd" || arg == "--embeddings") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<bool>(argv[i], split_delim); auto p = string_split<bool>(argv[i], split_delim);
params.embeddings.insert(params.embeddings.end(), p.begin(), p.end()); params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
} else if (arg == "-ts" || arg == "--tensor-split") { } else if (arg == "-ts" || arg == "--tensor-split") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
for (auto ts : split<std::string>(argv[i], split_delim)) { for (auto ts : string_split<std::string>(argv[i], split_delim)) {
// split string by ; and / // split string by ; and /
const std::regex regex{R"([;/]+)"}; const std::regex regex{R"([;/]+)"};
std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
@ -493,18 +501,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true; invalid_param = true;
break; break;
} }
            if (argv[i] == std::string("csv")) {
                params.output_format = CSV;
            } else if (argv[i] == std::string("json")) {
                params.output_format = JSON;
            } else if (argv[i] == std::string("md")) {
                params.output_format = MARKDOWN;
            } else if (argv[i] == std::string("sql")) {
                params.output_format = SQL;
            } else {
                invalid_param = true;
                break;
            }

            invalid_param = !output_format_from_str(argv[i], params.output_format);
        } else if (arg == "-oe" || arg == "--output-err") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
} else if (arg == "-v" || arg == "--verbose") { } else if (arg == "-v" || arg == "--verbose") {
params.verbose = true; params.verbose = true;
} else { } else {
@ -706,7 +709,6 @@ struct test {
static const std::string build_commit; static const std::string build_commit;
static const int build_number; static const int build_number;
static const bool cuda; static const bool cuda;
static const bool opencl;
static const bool vulkan; static const bool vulkan;
static const bool kompute; static const bool kompute;
static const bool metal; static const bool metal;
@ -795,9 +797,6 @@ struct test {
if (cuda) { if (cuda) {
return GGML_CUDA_NAME; return GGML_CUDA_NAME;
} }
if (opencl) {
return "OpenCL";
}
if (vulkan) { if (vulkan) {
return "Vulkan"; return "Vulkan";
} }
@ -826,7 +825,7 @@ struct test {
static const std::vector<std::string> & get_fields() { static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = { static const std::vector<std::string> fields = {
"build_commit", "build_number", "build_commit", "build_number",
"cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
"cpu_info", "gpu_info", "cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params", "model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch", "n_batch", "n_ubatch",
@ -852,7 +851,7 @@ struct test {
field == "avg_ns" || field == "stddev_ns") { field == "avg_ns" || field == "stddev_ns") {
return INT; return INT;
} }
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" || if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") { field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
return BOOL; return BOOL;
@ -881,7 +880,7 @@ struct test {
} }
std::vector<std::string> values = { std::vector<std::string> values = {
build_commit, std::to_string(build_number), build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan), std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas), std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info, cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
@ -910,7 +909,6 @@ struct test {
const std::string test::build_commit = LLAMA_COMMIT; const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER; const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cuda(); const bool test::cuda = !!ggml_cpu_has_cuda();
const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::vulkan = !!ggml_cpu_has_vulkan(); const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::kompute = !!ggml_cpu_has_kompute(); const bool test::kompute = !!ggml_cpu_has_kompute();
const bool test::metal = !!ggml_cpu_has_metal(); const bool test::metal = !!ggml_cpu_has_metal();
@ -1278,6 +1276,22 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text
(void) user_data; (void) user_data;
} }
static std::unique_ptr<printer> create_printer(output_formats format) {
switch (format) {
case NONE:
return nullptr;
case CSV:
return std::unique_ptr<printer>(new csv_printer());
case JSON:
return std::unique_ptr<printer>(new json_printer());
case MARKDOWN:
return std::unique_ptr<printer>(new markdown_printer());
case SQL:
return std::unique_ptr<printer>(new sql_printer());
}
GGML_ASSERT(false);
}
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
// try to set locale for unicode characters in markdown // try to set locale for unicode characters in markdown
setlocale(LC_CTYPE, ".UTF-8"); setlocale(LC_CTYPE, ".UTF-8");
@ -1304,26 +1318,18 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa); llama_numa_init(params.numa);
// initialize printer // initialize printer
    std::unique_ptr<printer> p;
    switch (params.output_format) {
        case CSV:
            p.reset(new csv_printer());
            break;
        case JSON:
            p.reset(new json_printer());
            break;
        case MARKDOWN:
            p.reset(new markdown_printer());
            break;
        case SQL:
            p.reset(new sql_printer());
            break;
        default:
            assert(false);
            exit(1);
    }
    p->fout = stdout;
    p->print_header(params);

    std::unique_ptr<printer> p     = create_printer(params.output_format);
    std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);

    if (p) {
        p->fout = stdout;
        p->print_header(params);
    }

    if (p_err) {
        p_err->fout = stderr;
        p_err->print_header(params);
    }
std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params); std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
@ -1381,7 +1387,15 @@ int main(int argc, char ** argv) {
t.samples_ns.push_back(t_ns); t.samples_ns.push_back(t_ns);
} }
p->print_test(t); if (p) {
p->print_test(t);
fflush(p->fout);
}
if (p_err) {
p_err->print_test(t);
fflush(p_err->fout);
}
llama_print_timings(ctx); llama_print_timings(ctx);
@ -1390,7 +1404,13 @@ int main(int argc, char ** argv) {
llama_free_model(lmodel); llama_free_model(lmodel);
p->print_footer(); if (p) {
p->print_footer();
}
if (p_err) {
p_err->print_footer();
}
llama_backend_free(); llama_backend_free();

View File

@ -112,9 +112,12 @@ struct llava_context {
struct llama_model * model = NULL; struct llama_model * model = NULL;
}; };
static void show_additional_info(int /*argc*/, char ** argv) { static void print_usage(int argc, char ** argv, const gpt_params & params) {
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); gpt_params_print_usage(argc, argv, params);
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
LOG_TEE("\n example usage:\n");
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
} }
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) { static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
@ -278,7 +281,7 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
show_additional_info(argc, argv); print_usage(argc, argv, params);
return 1; return 1;
} }
@ -290,8 +293,7 @@ int main(int argc, char ** argv) {
#endif // LOG_DISABLE_LOGS #endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
gpt_params_print_usage(argc, argv, params); print_usage(argc, argv, {});
show_additional_info(argc, argv);
return 1; return 1;
} }
auto model = llava_init(&params); auto model = llava_init(&params);

View File

@ -37,7 +37,8 @@ struct ngram_container {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }

View File

@ -14,8 +14,10 @@ int main(int argc, char ** argv){
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
// init llama.cpp // init llama.cpp
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);

View File

@ -16,6 +16,7 @@ int main(int argc, char ** argv){
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }

View File

@ -15,6 +15,7 @@ int main(int argc, char ** argv){
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }

View File

@ -8,16 +8,14 @@ Because this example is "outside of the source tree", it is important to first b
### Considerations ### Considerations
When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_. When hardware acceleration libraries are used (e.g. CUDA, Metal, etc.), CMake must be able to locate the associated CMake package.
### Build llama.cpp and install to C:\LlamaCPP directory ### Build llama.cpp and install to C:\LlamaCPP directory
In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
```cmd ```cmd
git clone https://github.com/ggerganov/llama.cpp git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp cd llama.cpp
cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64 cmake -B build -DBUILD_SHARED_LIBS=OFF -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release cmake --build build --config Release
cmake --install build --prefix C:/LlamaCPP cmake --install build --prefix C:/LlamaCPP
``` ```
@ -27,7 +25,7 @@ cmake --install build --prefix C:/LlamaCPP
```cmd ```cmd
cd ..\examples\main-cmake-pkg cd ..\examples\main-cmake-pkg
cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64 cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release cmake --build build --config Release
cmake --install build --prefix C:/MyLlamaApp cmake --install build --prefix C:/MyLlamaApp
``` ```
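To show what the installed package is ultimately consumed by, here is a small, hedged C++ sketch of a program that could live in such an out-of-tree project. It only uses calls that already appear elsewhere in this commit (`llama_backend_init`, `llama_model_default_params`, `llama_load_model_from_file`, `llama_n_ctx_train`, `llama_free_model`, `llama_backend_free`); the model path passed on the command line is an assumption of the example:

```cpp
// Hedged sketch of an out-of-tree consumer of the installed Llama package.
// Assumes the package provides llama.h and that a GGUF model path is passed
// on the command line; it only loads the model and prints one value.
#include "llama.h"

#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) {
        fprintf(stderr, "error: failed to load model '%s'\n", argv[1]);
        llama_backend_free();
        return 1;
    }

    printf("model loaded, n_ctx_train = %d\n", llama_n_ctx_train(model));

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```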

View File

@ -53,13 +53,13 @@ The following command generates "infinite" text from a starting prompt (you can
#### Unix-based systems (Linux, macOS, etc.): #### Unix-based systems (Linux, macOS, etc.):
```bash ```bash
./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt ./main -m models/7B/ggml-model.bin --ignore-eos -n -1
``` ```
#### Windows: #### Windows:
```powershell ```powershell
main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
``` ```
## Common Options ## Common Options
@ -69,7 +69,6 @@ In this section, we cover the most commonly used options for running the `main`
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set). - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set).
- `-mu MODEL_URL, --model-url MODEL_URL`: Specify a remote HTTP URL to download the file (e.g. https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-mu MODEL_URL, --model-url MODEL_URL`: Specify a remote HTTP URL to download the file (e.g. https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
@ -80,11 +79,10 @@ The `main` program provides several ways to interact with the LLaMA models using
- `--prompt PROMPT`: Provide a prompt directly as a command-line option. - `--prompt PROMPT`: Provide a prompt directly as a command-line option.
- `--file FNAME`: Provide a file containing a prompt or multiple prompts. - `--file FNAME`: Provide a file containing a prompt or multiple prompts.
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.) - `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
- `--random-prompt`: Start with a randomized prompt.
## Interaction ## Interaction
The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive`, `--interactive-first`, and `--instruct`. The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`.
In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing. In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
@ -92,7 +90,6 @@ In interactive mode, users can participate in text generation by injecting their
- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model. - `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation. - `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
- `-ins, --instruct`: Run the program in instruction mode, which is specifically designed to work with Alpaca models that excel in completing tasks based on user instructions.
- `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text. - `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text.
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs. By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
@ -121,16 +118,6 @@ The `--in-suffix` flag is used to add a suffix after your input. This is useful
./main -r "User:" --in-prefix " " --in-suffix "Assistant:" ./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
``` ```
### Instruction Mode
Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks:
- `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
Technical detail: the user's input is internally prefixed with the reverse prompt (or `### Instruction:` as the default), and followed by `### Response:` (except if you just press Return without any input, to keep generating a longer response).
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
## Context Management ## Context Management
During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations. During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.

View File

@ -122,8 +122,10 @@ int main(int argc, char ** argv) {
g_params = &params; g_params = &params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
llama_sampling_params & sparams = params.sparams; llama_sampling_params & sparams = params.sparams;
#ifndef LOG_DISABLE_LOGS #ifndef LOG_DISABLE_LOGS
@ -180,9 +182,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: seed = %u\n", __func__, params.seed); LOG_TEE("%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = string_random_prompt(rng);
}
LOG("%s: llama backend init\n", __func__); LOG("%s: llama backend init\n", __func__);
llama_backend_init(); llama_backend_init();
@ -250,11 +249,8 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) { if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n"); LOG("tokenize the prompt\n");
if (params.chatml) {
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
}
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true); embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
} else { } else {
LOG("use session tokens\n"); LOG("use session tokens\n");
@ -332,37 +328,13 @@ int main(int argc, char ** argv) {
} }
// number of tokens to keep when resetting context // number of tokens to keep when resetting context
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) { if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
params.n_keep = (int)embd_inp.size(); params.n_keep = (int)embd_inp.size();
} else { } else {
params.n_keep += add_bos; // always keep the BOS token params.n_keep += add_bos; // always keep the BOS token
} }
// prefix & suffix for instruct mode if (params.conversation) {
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
// chatml prefix & suffix
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
// in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) {
params.interactive_first = true;
params.antiprompt.emplace_back("### Instruction:\n\n");
}
// similar for chatml mode
else if (params.chatml) {
params.interactive_first = true;
params.antiprompt.emplace_back("<|im_start|>user\n");
}
else if (params.conversation) {
params.interactive_first = true; params.interactive_first = true;
} }
@ -823,15 +795,13 @@ int main(int argc, char ** argv) {
is_interacting = true; is_interacting = true;
printf("\n"); printf("\n");
} else if (params.instruct || params.chatml) {
is_interacting = true;
} }
} }
if (n_past > 0 && is_interacting) { if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n"); LOG("waiting for user input\n");
if (params.conversation || params.instruct || params.chatml) { if (params.conversation) {
printf("\n> "); printf("\n> ");
} }
@ -874,24 +844,12 @@ int main(int argc, char ** argv) {
const size_t original_size = embd_inp.size(); const size_t original_size = embd_inp.size();
// instruct mode: insert instruction prefix
if (params.instruct && !is_antiprompt) {
LOG("inserting instruction prefix\n");
n_consumed = embd_inp.size();
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
}
// chatml mode: insert user chat prefix
if (params.chatml && !is_antiprompt) {
LOG("inserting chatml prefix\n");
n_consumed = embd_inp.size();
embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
}
if (params.escape) { if (params.escape) {
string_process_escapes(buffer); string_process_escapes(buffer);
} }
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials); const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
@ -900,17 +858,6 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
// instruct mode: insert response suffix
if (params.instruct) {
LOG("inserting instruction suffix\n");
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
}
// chatml mode: insert assistant chat suffix
if (params.chatml) {
LOG("inserting chatml suffix\n");
embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
}
for (size_t i = original_size; i < embd_inp.size(); ++i) { for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i]; const llama_token token = embd_inp[i];
output_tokens.push_back(token); output_tokens.push_back(token);
@ -935,7 +882,7 @@ int main(int argc, char ** argv) {
} }
// end of generation // end of generation
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) { if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
LOG_TEE(" [end of text]\n"); LOG_TEE(" [end of text]\n");
break; break;
} }

View File

@ -100,7 +100,8 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }

View File

@ -8,5 +8,5 @@ See the following PRs for more info:
### Usage ### Usage
```bash ```bash
make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250 make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
``` ```

View File

@ -6,46 +6,32 @@
#include <string> #include <string>
#include <vector> #include <vector>
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
LOG_TEE("\n");
}
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (argc == 1 || argv[1][0] == '-') { params.n_junk = 250;
printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]); params.n_keep = 32;
return 1 ; params.i_pos = -1;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
} }
int seed = -1; srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
int n_junk = 250; // number of times to repeat the junk text int n_junk = params.n_junk;
int n_keep = 32; // number of tokens in the prompt prefix int n_keep = params.n_keep;
int n_grp = 1; // if more than 1 - perform LongLM SelfExtend int n_grp = params.grp_attn_n;
int i_pos = -1; // position of the passkey in the junk text int i_pos = params.i_pos;
if (argc >= 2) {
params.model = argv[1];
}
if (argc >= 3) {
n_junk = std::stoi(argv[2]);
}
if (argc >= 4) {
n_grp = std::stoi(argv[3]);
}
if (argc >= 5) {
i_pos = std::stoi(argv[4]);
}
if (argc >= 6) {
seed = std::stoi(argv[5]);
}
if (seed == -1) {
seed = time(NULL);
}
srand(seed);
if (i_pos == -1) { if (i_pos == -1) {
i_pos = rand() % n_junk; i_pos = rand() % n_junk;
@ -76,9 +62,7 @@ int main(int argc, char ** argv) {
// initialize the model // initialize the model
llama_model_params model_params = llama_model_default_params(); llama_model_params model_params = llama_model_params_from_gpt_params(params);
model_params.n_gpu_layers = 99; // offload all layers to the GPU
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@ -89,13 +73,9 @@ int main(int argc, char ** argv) {
// initialize the context // initialize the context
llama_context_params ctx_params = llama_context_default_params(); llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
ctx_params.seed = seed; ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
ctx_params.n_batch = 512;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp"); GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
@ -135,7 +115,7 @@ int main(int argc, char ** argv) {
LOG_TEE("prompt tokens: %d\n", n_tokens_all); LOG_TEE("prompt tokens: %d\n", n_tokens_all);
//LOG_TEE("prompt: %s\n", params.prompt.c_str()); //LOG_TEE("prompt: %s\n", params.prompt.c_str());
llama_batch batch = llama_batch_init(512, 0, 1); llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
int n_past = 0; int n_past = 0;

View File

@ -1032,7 +1032,7 @@ struct winogrande_entry {
std::vector<llama_token> seq_tokens[2]; std::vector<llama_token> seq_tokens[2];
}; };
static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) { static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string & prompt) {
std::vector<winogrande_entry> result; std::vector<winogrande_entry> result;
std::istringstream in(prompt); std::istringstream in(prompt);
std::string line; std::string line;
@ -1964,12 +1964,14 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
params.n_ctx = 512;
params.logits_all = true;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
params.logits_all = true;
const int32_t n_ctx = params.n_ctx; const int32_t n_ctx = params.n_ctx;
if (n_ctx <= 0) { if (n_ctx <= 0) {
@ -2006,9 +2008,6 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = string_random_prompt(rng);
}
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
@ -2027,6 +2026,7 @@ int main(int argc, char ** argv) {
} }
const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) { if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx); __func__, n_ctx_train, params.n_ctx);

View File

@ -624,7 +624,7 @@ string ::= "\"" (
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" ws )* "\"" ws
ws ::= ([ \t\n] ws)? ws ::= ([ \t\n] ws)?
float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws float ::= ("-"? ([0] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
integer ::= [0-9]+""" integer ::= [0-9]+"""

View File

@ -47,7 +47,7 @@ echo PASS
echo echo
# 3a. Test the requanted model is loading properly # 3a. Test the requanted model is loading properly
$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32 $MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
echo PASS echo PASS
echo echo
@ -57,7 +57,7 @@ echo PASS
echo echo
# 4b. Test the requanted model is loading properly # 4b. Test the requanted model is loading properly
$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32 $MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
echo PASS echo PASS
echo echo

View File

@ -4,72 +4,12 @@
#include <algorithm> #include <algorithm>
#include <fstream> #include <fstream>
struct retrieval_params { static void print_usage(int argc, char ** argv, const gpt_params & params) {
std::vector<std::string> context_files; // context files to embed gpt_params_print_usage(argc, argv, params);
int32_t chunk_size = 64; // chunk size for context embedding
std::string chunk_separator = "\n"; // chunk separator for context embedding
};
static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) { LOG_TEE("\nexample usage:\n");
gpt_params_print_usage(argc, argv, gpt_params); LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
printf("retrieval options:\n"); LOG_TEE("\n");
printf(" --context-file FNAME file containing context to embed.\n");
printf(" specify multiple files by providing --context-file option multiple times.\n");
printf(" --chunk-size N minimum length of embedded text chunk (default:%d)\n", params.chunk_size);
printf(" --chunk-separator STRING\n");
printf(" string to separate chunks (default: \"\\n\")\n");
printf("\n");
}
static void retrieval_params_parse(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & retrieval_params) {
int i = 1;
std::string arg;
while (i < argc) {
arg = argv[i];
bool invalid_gpt_param = false;
if(gpt_params_find_arg(argc, argv, argv[i], gpt_params, i, invalid_gpt_param)) {
if (invalid_gpt_param) {
fprintf(stderr, "error: invalid argument: %s\n", arg.c_str());
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
// option was parsed by gpt_params_find_arg
} else if (arg == "--context-file") {
if (++i >= argc) {
fprintf(stderr, "error: missing argument for --context-file\n");
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
// store the external file name in params
retrieval_params.context_files.push_back(argv[i]);
} else if (arg == "--chunk-size") {
if (++i >= argc) {
fprintf(stderr, "error: missing argument for --chunk-size\n");
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
retrieval_params.chunk_size = std::stoi(argv[i]);
} else if (arg == "--chunk-separator") {
if (++i >= argc) {
fprintf(stderr, "error: missing argument for --chunk-separator\n");
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
retrieval_params.chunk_separator = argv[i];
} else {
// unknown argument
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
i++;
}
} }
struct chunk { struct chunk {
@ -171,33 +111,35 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
retrieval_params retrieval_params;
retrieval_params_parse(argc, argv, params, retrieval_params); if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
// For BERT models, batch size must be equal to ubatch size // For BERT models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch; params.n_ubatch = params.n_batch;
params.embedding = true;
if (retrieval_params.chunk_size <= 0) { if (params.chunk_size <= 0) {
fprintf(stderr, "chunk_size must be positive\n"); fprintf(stderr, "chunk_size must be positive\n");
return 1; return 1;
} }
if (retrieval_params.context_files.empty()) { if (params.context_files.empty()) {
fprintf(stderr, "context_files must be specified\n"); fprintf(stderr, "context_files must be specified\n");
return 1; return 1;
} }
params.embedding = true;
print_build_info(); print_build_info();
printf("processing files:\n"); printf("processing files:\n");
for (auto & context_file : retrieval_params.context_files) { for (auto & context_file : params.context_files) {
printf("%s\n", context_file.c_str()); printf("%s\n", context_file.c_str());
} }
std::vector<chunk> chunks; std::vector<chunk> chunks;
for (auto & context_file : retrieval_params.context_files) { for (auto & context_file : params.context_files) {
std::vector<chunk> file_chunk = chunk_file(context_file, retrieval_params.chunk_size, retrieval_params.chunk_separator); std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
} }
printf("Number of chunks: %ld\n", chunks.size()); printf("Number of chunks: %ld\n", chunks.size());
@ -242,7 +184,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
// add eos if not present // add eos if not present
if (inp.empty() || inp.back() != llama_token_eos(model)) { if (llama_token_eos(model) >= 0 && (inp.empty() || inp.back() != llama_token_eos(model))) {
inp.push_back(llama_token_eos(model)); inp.push_back(llama_token_eos(model));
} }
chunk.tokens = inp; chunk.tokens = inp;

View File

@ -6,10 +6,6 @@
#include "ggml-metal.h" #include "ggml-metal.h"
#endif #endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#include "ggml-rpc.h" #include "ggml-rpc.h"
#ifdef _WIN32 #ifdef _WIN32
# include <windows.h> # include <windows.h>
@ -83,12 +79,6 @@ static ggml_backend_t create_backend() {
if (!backend) { if (!backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
} }
#elif GGML_USE_SYCL
fprintf(stderr, "%s: using SYCL backend\n", __func__);
backend = ggml_backend_sycl_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
}
#endif #endif
// if there aren't GPU Backends fallback to CPU backend // if there aren't GPU Backends fallback to CPU backend

View File

@ -11,6 +11,7 @@ int main(int argc, char ** argv) {
params.prompt = "The quick brown fox"; params.prompt = "The quick brown fox";
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }

View File

@ -279,7 +279,7 @@ node index.js
`id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot. Default: `-1` `id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot. Default: `-1`
`cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. Default: `false` `cache_prompt`: Re-use the KV cache from a previous request if possible. This way, the common prefix does not have to be re-processed; only the suffix that differs between the requests does. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation), enabling this option can cause nondeterministic results. Default: `false`
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

View File

@ -416,7 +416,7 @@
message = html`<${Probabilities} data=${data} />` message = html`<${Probabilities} data=${data} />`
} else { } else {
const text = isArrayMessage ? const text = isArrayMessage ?
data.map(msg => msg.content).join('').replace(/^\s+/, '') : data.map(msg => msg.content).join('') :
data; data;
message = isCompletionMode ? message = isCompletionMode ?
text : text :

View File

@ -2,57 +2,26 @@
const SPACE_RULE = '" "?'; const SPACE_RULE = '" "?';
function _buildRepetition(itemRule, minItems, maxItems, opts={}) { function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
if (minItems === 0 && maxItems === 1) {
return `${itemRule}?`;
}
const separatorRule = opts.separatorRule ?? ''; const separatorRule = opts.separatorRule ?? '';
const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false
if (separatorRule === '') { if (separatorRule === '') {
if (minItems === 0 && maxItems === 1) { if (minItems === 1 && maxItems === undefined) {
return `${itemRule}?`;
} else if (minItems === 1 && maxItems === undefined) {
return `${itemRule}+`; return `${itemRule}+`;
} } else if (minItems === 0 && maxItems === undefined) {
} return `${itemRule}*`;
let result = '';
if (minItems > 0) {
if (itemRuleIsLiteral && separatorRule === '') {
result = `"${itemRule.slice(1, -1).repeat(minItems)}"`;
} else { } else {
result = Array.from({ length: minItems }, () => itemRule) return `${itemRule}{${minItems},${maxItems !== undefined ? maxItems : ''}}`;
.join(separatorRule !== '' ? ` ${separatorRule} ` : ' ');
} }
} }
const optRepetitions = (upToN, prefixWithSep=false) => { const result = itemRule + ' ' + _buildRepetition(`(${separatorRule} ${itemRule})`, minItems > 0 ? minItems - 1 : 0, maxItems !== undefined ? maxItems - 1 : undefined);
const content = separatorRule !== '' && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule; return minItems === 0 ? `(${result})?` : result;
if (upToN === 0) {
return '';
} else if (upToN === 1) {
return `(${content})?`;
} else if (separatorRule !== '' && !prefixWithSep) {
return `(${content} ${optRepetitions(upToN - 1, true)})?`;
} else {
return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join('');
}
};
if (minItems > 0 && maxItems !== minItems) {
result += ' ';
}
if (maxItems !== undefined) {
result += optRepetitions(maxItems - minItems, minItems > 0);
} else {
const itemOperator = `(${separatorRule !== '' ? separatorRule + ' ' : ''}${itemRule})`;
if (minItems === 0 && separatorRule !== '') {
result = `(${itemRule} ${itemOperator}*)?`;
} else {
result += `${itemOperator}*`;
}
}
return result;
} }
class BuiltinRule { class BuiltinRule {
@ -62,27 +31,25 @@ class BuiltinRule {
} }
} }
const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15);
const PRIMITIVE_RULES = { const PRIMITIVE_RULES = {
boolean : new BuiltinRule('("true" | "false") space', []), boolean : new BuiltinRule('("true" | "false") space', []),
'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []), 'decimal-part' : new BuiltinRule('[0-9]{1,16}', []),
'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []), 'integral-part': new BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']), integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']),
value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
uuid : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []), uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []),
char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []), char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})`, []),
string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']), string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
null : new BuiltinRule('"null" space', []), null : new BuiltinRule('"null" space', []),
}; };
// TODO: support "uri", "email" string formats // TODO: support "uri", "email" string formats
const STRING_FORMAT_RULES = { const STRING_FORMAT_RULES = {
'date' : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), 'date' : new BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), 'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : new BuiltinRule('date "T" time', ['date', 'time']), 'date-time' : new BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']), 'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']), 'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']),

File diff suppressed because it is too large

View File

@ -116,13 +116,6 @@ static inline void server_log(const char * level, const char * function, int lin
// chat template utils // chat template utils
// //
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
inline bool verify_custom_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0;
}
// Format given chat. If tmpl is empty, we take the template from model metadata // Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) { inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
size_t alloc_size = 0; size_t alloc_size = 0;
@ -260,6 +253,13 @@ static size_t common_part(const std::vector<llama_token> & a, const std::vector<
return i; return i;
} }
static size_t common_part(const std::string & a, const std::string & b) {
size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
return i;
}
static bool ends_with(const std::string & str, const std::string & suffix) { static bool ends_with(const std::string & str, const std::string & suffix) {
return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
} }
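As a hedged illustration of how a shared-prefix helper like the `common_part` overload added above relates to the `cache_prompt` behavior described in the server README earlier (only the differing suffix needs to be re-processed), here is a self-contained sketch; `cached_prompt`, `new_prompt`, and `n_reusable` are illustrative names, not identifiers from the server code:

```cpp
#include <cstddef>
#include <cstdio>
#include <string>

// Length of the shared prefix of two strings (same idea as the helper above).
static size_t common_part(const std::string & a, const std::string & b) {
    size_t i;
    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
    return i;
}

int main() {
    // Hypothetical prompts from two consecutive requests.
    const std::string cached_prompt = "You are a helpful assistant.\nUser: hello";
    const std::string new_prompt    = "You are a helpful assistant.\nUser: how are you?";

    // Only the part after the shared prefix would need to be re-processed
    // when a prompt-cache mechanism such as cache_prompt is enabled.
    const size_t n_reusable = common_part(cached_prompt, new_prompt);
    printf("reusable prefix: %zu chars, to re-process: %zu chars\n",
           n_reusable, new_prompt.size() - n_reusable);
    return 0;
}
```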

View File

@ -3,7 +3,7 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt. The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
```bash ```bash
./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" ./simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
... ...

View File

@ -6,28 +6,27 @@
#include <string> #include <string>
#include <vector> #include <vector>
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
LOG_TEE("\n");
}
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (argc == 1 || argv[1][0] == '-') { params.prompt = "Hello my name is";
printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]); params.n_predict = 32;
return 1 ;
}
if (argc >= 2) { if (!gpt_params_parse(argc, argv, params)) {
params.model = argv[1]; print_usage(argc, argv, params);
} return 1;
if (argc >= 3) {
params.prompt = argv[2];
}
if (params.prompt.empty()) {
params.prompt = "Hello my name is";
} }
// total length of the sequence including the prompt // total length of the sequence including the prompt
const int n_len = 32; const int n_predict = params.n_predict;
// init LLM // init LLM
@ -36,9 +35,7 @@ int main(int argc, char ** argv) {
// initialize the model // initialize the model
llama_model_params model_params = llama_model_default_params(); llama_model_params model_params = llama_model_params_from_gpt_params(params);
// model_params.n_gpu_layers = 99; // offload all layers to the GPU
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@ -49,12 +46,7 @@ int main(int argc, char ** argv) {
// initialize the context // initialize the context
llama_context_params ctx_params = llama_context_default_params(); llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
ctx_params.seed = 1234;
ctx_params.n_ctx = 2048;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_new_context_with_model(model, ctx_params);
@ -69,14 +61,14 @@ int main(int argc, char ** argv) {
tokens_list = ::llama_tokenize(ctx, params.prompt, true); tokens_list = ::llama_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req); LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens // make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) { if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__); LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__);
return 1; return 1;
} }
@ -115,7 +107,7 @@ int main(int argc, char ** argv) {
const auto t_main_start = ggml_time_us(); const auto t_main_start = ggml_time_us();
while (n_cur <= n_len) { while (n_cur <= n_predict) {
// sample the next token // sample the next token
{ {
auto n_vocab = llama_n_vocab(model); auto n_vocab = llama_n_vocab(model);
@ -134,7 +126,7 @@ int main(int argc, char ** argv) {
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of generation? // is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
LOG_TEE("\n"); LOG_TEE("\n");
break; break;

View File

@ -27,7 +27,8 @@ struct seq_draft {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }

View File

@ -302,7 +302,7 @@ static struct ggml_tensor * llama_build_train_graphs(
const int rope_mode = 0; const int rope_mode = 0;
return ggml_rope_ext( return ggml_rope_ext(
ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
); );
}; };

flake.lock generated
View File

@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1716948383, "lastModified": 1717786204,
"narHash": "sha256-SzDKxseEcHR5KzPXLwsemyTR/kaM9whxeiJohbL04rs=", "narHash": "sha256-4q0s6m0GUcN7q+Y2DqD27iLvbcd1G50T2lv08kKxkSI=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "ad57eef4ef0659193044870c731987a6df5cf56b", "rev": "051f920625ab5aabe37c920346e3e69d7d34400e",
"type": "github" "type": "github"
}, },
"original": { "original": {

View File

@ -159,7 +159,6 @@
windows = config.legacyPackages.llamaPackagesWindows.llama-cpp; windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
} }
// lib.optionalAttrs pkgs.stdenv.isLinux { // lib.optionalAttrs pkgs.stdenv.isLinux {
opencl = config.packages.default.override { useOpenCL = true; };
cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp; cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;
mpi-cpu = config.packages.default.override { useMpi = true; }; mpi-cpu = config.packages.default.override { useMpi = true; };

View File

@ -123,12 +123,18 @@ typedef sycl::half2 ggml_half2;
#define QI1_S (QK_K / (4*QR1_S)) #define QI1_S (QK_K / (4*QR1_S))
#define QR1_S 8 #define QR1_S 8
#define QI1_M (QK_K / (4*QR1_M))
#define QR1_M 8
#define QI4_NL (QK4_NL / (4*QR4_NL)) #define QI4_NL (QK4_NL / (4*QR4_NL))
#define QR4_NL 2 #define QR4_NL 2
#define QI4_XS (QK_K / (4*QR4_XS)) #define QI4_XS (QK_K / (4*QR4_XS))
#define QR4_XS 8 #define QR4_XS 8
#define QI3_S (QK_K / (4*QR3_S))
#define QR3_S 8
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
#define QK4_0 32 #define QK4_0 32

View File

@ -633,88 +633,22 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
// cuda split buffer // cuda split buffer
static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) { static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
int64_t min_compute_capability = INT_MAX; int64_t row_rounding = 0;
int64_t max_compute_capability = INT_MIN;
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) { if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
if (min_compute_capability > ggml_cuda_info().devices[id].cc) { continue;
min_compute_capability = ggml_cuda_info().devices[id].cc;
}
if (max_compute_capability < ggml_cuda_info().devices[id].cc) {
max_compute_capability = ggml_cuda_info().devices[id].cc;
}
} }
}
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) const int cc = ggml_cuda_info().devices[id].cc;
switch(type) { row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
case GGML_TYPE_F16:
case GGML_TYPE_F32:
return 1;
case GGML_TYPE_Q2_K:
return max_compute_capability >= CC_RDNA2 ? 128 : 32;
case GGML_TYPE_Q3_K:
return min_compute_capability < CC_RDNA2 ? 128 : 64;
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
default:
GGML_ASSERT(false);
} }
#else return row_rounding;
switch(type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
return max_compute_capability >= CC_VOLTA ? 128 : 64;
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
return 64;
case GGML_TYPE_F16:
case GGML_TYPE_F32:
return 1;
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
return max_compute_capability >= CC_VOLTA ? 128 : 64;
case GGML_TYPE_Q6_K:
return 64;
default:
GGML_ASSERT(false);
}
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
} }
static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) { static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
const int64_t nrows = ggml_nrows(tensor); const int64_t nrows = ggml_nrows(tensor);
const int64_t rounding = get_row_rounding(tensor->type, tensor_split); const int64_t rounding = get_row_rounding(tensor_split);
*row_low = id == 0 ? 0 : nrows*tensor_split[id]; *row_low = id == 0 ? 0 : nrows*tensor_split[id];
*row_low -= *row_low % rounding; *row_low -= *row_low % rounding;
@ -1413,10 +1347,30 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
GGML_UNUSED(main_device); GGML_UNUSED(main_device);
} }
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
#if !defined(GGML_USE_HIPBLAS)
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
cudaMemcpy3DPeerParms p = {};
p.dstDevice = dstDevice;
p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
p.srcDevice = srcDevice;
p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
p.extent = make_cudaExtent(width, height, 1);
return cudaMemcpy3DPeerAsync(&p, stream);
#else
// HIP does not support cudaMemcpy3DPeerAsync or vmm pools
GGML_UNUSED(dstDevice);
GGML_UNUSED(srcDevice);
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
#endif // !defined(GGML_USE_HIPBLAS)
}
static void ggml_cuda_op_mul_mat( static void ggml_cuda_op_mul_mat(
ggml_backend_cuda_context & ctx, ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
const bool convert_src1_to_q8_1) { quantize_cuda_t quantize_src1) {
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1]; const int64_t ne01 = src0->ne[1];
@ -1473,7 +1427,9 @@ static void ggml_cuda_op_mul_mat(
} }
struct dev_data { struct dev_data {
ggml_cuda_pool_alloc<char> src0_dd_alloc; int cc;
ggml_cuda_pool_alloc<char> src0_dd_alloc;
ggml_cuda_pool_alloc<float> src1_ddf_alloc; ggml_cuda_pool_alloc<float> src1_ddf_alloc;
ggml_cuda_pool_alloc<char> src1_ddq_alloc; ggml_cuda_pool_alloc<char> src1_ddq_alloc;
ggml_cuda_pool_alloc<float> dst_dd_alloc; ggml_cuda_pool_alloc<float> dst_dd_alloc;
@ -1492,6 +1448,8 @@ static void ggml_cuda_op_mul_mat(
int used_devices = 0; int used_devices = 0;
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
dev[id].cc = ggml_cuda_info().devices[id].cc;
// by default, use all rows // by default, use all rows
dev[id].row_low = 0; dev[id].row_low = 0;
dev[id].row_high = ne01; dev[id].row_high = ne01;
@ -1499,7 +1457,7 @@ static void ggml_cuda_op_mul_mat(
// for multi GPU, get the row boundaries from tensor split // for multi GPU, get the row boundaries from tensor split
// and round to mul_mat_q tile sizes // and round to mul_mat_q tile sizes
if (split) { if (split) {
const int64_t rounding = get_row_rounding(src0->type, tensor_split); const int64_t rounding = get_row_rounding(tensor_split);
if (id != 0) { if (id != 0) {
dev[id].row_low = ne01*tensor_split[id]; dev[id].row_low = ne01*tensor_split[id];
@ -1542,11 +1500,15 @@ static void ggml_cuda_op_mul_mat(
dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1)); dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
} }
if (convert_src1_to_q8_1) { if (quantize_src1) {
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs); size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
if (quantize_src1 == quantize_mmq_q8_1_cuda) {
src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
}
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
if (src1_on_device && src1_is_contiguous) { if (src1_on_device && src1_is_contiguous) {
quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream); quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaGetLastError());
} }
} }
@ -1592,7 +1554,12 @@ static void ggml_cuda_op_mul_mat(
const int64_t i03 = i0 / ne12; const int64_t i03 = i0 / ne12;
const int64_t i02 = i0 % ne12; const int64_t i02 = i0 % ne12;
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs; size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
if (quantize_src1 == quantize_mmq_q8_1_cuda) {
src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
} else {
src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
}
// for split tensors the data begins at i0 == i0_offset_low // for split tensors the data begins at i0 == i0_offset_low
char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs; char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
@ -1609,10 +1576,17 @@ static void ggml_cuda_op_mul_mat(
// copy src0, src1 to device if necessary // copy src0, src1 to device if necessary
if (src1_is_contiguous) { if (src1_is_contiguous) {
if (id != ctx.device) { if (id != ctx.device) {
if (convert_src1_to_q8_1) { if (quantize_src1) {
char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset; char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device, if (quantize_src1 == quantize_mmq_q8_1_cuda) {
src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream)); const size_t pitch = ne11*sizeof(block_q8_1_mmq);
const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
const size_t height = src1_padded_col_size/(4*QK8_1);
CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
} else {
CUDA_CHECK(cudaMemcpyPeerAsync(
src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
}
} else { } else {
float * src1_ddf_i_source = (float *) src1->data; float * src1_ddf_i_source = (float *) src1->data;
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10; src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
@ -1627,8 +1601,8 @@ static void ggml_cuda_op_mul_mat(
GGML_ASSERT(false); GGML_ASSERT(false);
} }
if (convert_src1_to_q8_1 && !src1_is_contiguous) { if (quantize_src1 && !src1_is_contiguous) {
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaGetLastError());
} }
@ -1653,22 +1627,8 @@ static void ggml_cuda_op_mul_mat(
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
dhf_dst_i += src1_col_0*ne0 + dev[id].row_low; dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
#if !defined(GGML_USE_HIPBLAS) CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
cudaMemcpy3DPeerParms p = {};
p.dstDevice = ctx.device;
p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
p.srcDevice = id;
p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
#else
// HIP does not support cudaMemcpy3DPeerAsync or vmm pools
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
dst_dd_i, row_diff*sizeof(float),
row_diff*sizeof(float), src1_ncols,
cudaMemcpyDeviceToDevice, stream));
#endif
} else { } else {
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
@ -2007,13 +1967,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
// KQ + KQV multi-batch // KQ + KQV multi-batch
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
} else if (use_dequantize_mul_mat_vec) { } else if (use_dequantize_mul_mat_vec) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
} else if (use_mul_mat_vec_q) { } else if (use_mul_mat_vec_q) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
} else if (use_mul_mat_q) { } else if (use_mul_mat_q) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true); ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
} else { } else {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
} }
} }
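
The dispatch above is the visible effect of this patch's main refactor: ggml_cuda_op_mul_mat now takes a quantize_cuda_t function pointer instead of the old convert_src1_to_q8_1 flag, so the MMQ path can pass quantize_mmq_q8_1_cuda (tiled block_q8_1_mmq layout), MMVQ keeps quantize_row_q8_1_cuda, and the FP32 paths pass nullptr. A minimal stand-alone C++ sketch of that strategy-pointer pattern, with toy quantizers rather than the real kernels:

#include <cstdint>
#include <cstdio>

// Simplified stand-in for quantize_cuda_t: quantize one row of src1 (illustration only).
typedef void (*quantize_fn)(const float * x, int8_t * y, int64_t n);

static void quantize_rowwise(const float * x, int8_t * y, int64_t n) {
    for (int64_t i = 0; i < n; ++i) y[i] = (int8_t)(x[i] * 127.0f);     // plain row-major q8 layout
}

static void quantize_tiled(const float * x, int8_t * y, int64_t n) {
    for (int64_t i = 0; i < n; ++i) y[i ^ 1] = (int8_t)(x[i] * 127.0f); // pretend "MMQ" tile reordering
}

// The caller decides the layout by passing the quantizer; nullptr means "leave src1 as float".
static void mul_mat_op(const float * src1, int8_t * src1_q, int64_t n, quantize_fn quantize_src1) {
    if (quantize_src1) {
        quantize_src1(src1, src1_q, n);
    }
    // ... run the kernel that matches the chosen layout ...
}

int main() {
    float  src1[4] = {0.1f, 0.2f, 0.3f, 0.4f};
    int8_t q[4]    = {0};
    mul_mat_op(src1, q, 4, quantize_rowwise); // MMVQ-style path
    mul_mat_op(src1, q, 4, quantize_tiled);   // MMQ-style path
    mul_mat_op(src1, q, 4, nullptr);          // cuBLAS/DMMV-style path, no quantization
    printf("%d %d %d %d\n", q[0], q[1], q[2], q[3]);
    return 0;
}
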
@ -2702,10 +2662,8 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
if (cuda_graph_update_required) { if (cuda_graph_update_required) {
// Extract nodes from graph // Extract nodes from graph
if (cuda_ctx->cuda_graph->num_nodes == 0) { // First call with null argument gets number of nodes in graph
// First call with null argument gets number of nodes in graph CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
}
// Subsequent call with non-null argument gets nodes // Subsequent call with non-null argument gets nodes
cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes); cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes); cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);


@ -160,7 +160,7 @@
#endif #endif
#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels #define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available #define MMQ_MAX_BATCH_SIZE 64 // max batch size to use MMQ kernels when tensor cores are available
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
@ -484,6 +484,161 @@ static __device__ __forceinline__ float get_alibi_slope(
return powf(base, exph); return powf(base, exph);
} }
template <ggml_type type>
struct ggml_cuda_type_traits;
template<>
struct ggml_cuda_type_traits<GGML_TYPE_F16> {
static constexpr int qk = 1;
static constexpr int qr = 1;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
static constexpr int qk = QK4_0;
static constexpr int qr = QR4_0;
static constexpr int qi = QI4_0;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> {
static constexpr int qk = QK4_1;
static constexpr int qr = QR4_1;
static constexpr int qi = QI4_1;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> {
static constexpr int qk = QK5_0;
static constexpr int qr = QR5_0;
static constexpr int qi = QI5_0;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
static constexpr int qk = QK5_1;
static constexpr int qr = QR5_1;
static constexpr int qi = QI5_1;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
static constexpr int qk = QK8_0;
static constexpr int qr = QR8_0;
static constexpr int qi = QI8_0;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
static constexpr int qk = QK_K;
static constexpr int qr = QR2_K;
static constexpr int qi = QI2_K;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> {
static constexpr int qk = QK_K;
static constexpr int qr = QR3_K;
static constexpr int qi = QI3_K;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> {
static constexpr int qk = QK_K;
static constexpr int qr = QR4_K;
static constexpr int qi = QI4_K;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> {
static constexpr int qk = QK_K;
static constexpr int qr = QR5_K;
static constexpr int qi = QI5_K;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> {
static constexpr int qk = QK_K;
static constexpr int qr = QR6_K;
static constexpr int qi = QI6_K;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> {
static constexpr int qk = QK_K;
static constexpr int qr = QR2_XXS;
static constexpr int qi = QI2_XXS;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> {
static constexpr int qk = QK_K;
static constexpr int qr = QR2_XS;
static constexpr int qi = QI2_XS;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> {
static constexpr int qk = QK_K;
static constexpr int qr = QR2_S;
static constexpr int qi = QI2_S;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> {
static constexpr int qk = QK_K;
static constexpr int qr = QR3_XXS;
static constexpr int qi = QI3_XXS;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> {
static constexpr int qk = QK_K;
static constexpr int qr = QR1_S;
static constexpr int qi = QI1_S;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
static constexpr int qk = QK_K;
static constexpr int qr = QR1_M;
static constexpr int qi = QI1_M;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
static constexpr int qk = QK4_NL;
static constexpr int qr = QR4_NL;
static constexpr int qi = QI4_NL;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
static constexpr int qk = QK_K;
static constexpr int qr = QR4_XS;
static constexpr int qi = QI4_XS;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
static constexpr int qk = QK_K;
static constexpr int qr = QR3_S;
static constexpr int qi = QI3_S;
};
static int get_mmq_x_max_host(const int cc) {
#ifdef CUDA_USE_TENSOR_CORES
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
#else
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
#endif // CUDA_USE_TENSOR_CORES
}
// Round rows to this value for --split-mode row:
static int get_mmq_y_host(const int cc, const int mmq_x) {
return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64;
}
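
The new ggml_cuda_type_traits specializations let a kernel be templated on the ggml_type value alone and read qk/qr/qi as constexpr members, instead of threading them through as separate template parameters; get_mmq_x_max_host and get_mmq_y_host likewise keep the tile-size policy in one place. A small self-contained C++ illustration of the traits pattern (toy enum and constants, not the real quantization parameters):

#include <cstdio>

enum toy_type { TOY_Q4_0, TOY_Q8_0 };   // stand-ins for ggml_type values

template <toy_type type> struct toy_type_traits;

template <> struct toy_type_traits<TOY_Q4_0> { static constexpr int qk = 32; static constexpr int qr = 2; };
template <> struct toy_type_traits<TOY_Q8_0> { static constexpr int qk = 32; static constexpr int qr = 1; };

// A "kernel" templated on the type enum; qk/qr become compile-time constants inside.
template <toy_type type>
int values_per_iteration() {
    constexpr int qk = toy_type_traits<type>::qk;
    constexpr int qr = toy_type_traits<type>::qr;
    return qk / qr;
}

int main() {
    printf("Q4_0: %d, Q8_0: %d\n", values_per_iteration<TOY_Q4_0>(), values_per_iteration<TOY_Q8_0>());
    return 0;
}
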
////////////////////// //////////////////////
struct ggml_cuda_device_info { struct ggml_cuda_device_info {


@ -422,10 +422,22 @@ static __device__ void convert_f16(const void * vx, const int64_t ib, const int
v.y = x[ib + iqs + 1]; v.y = x[ib + iqs + 1];
} }
template <int qk, int qr, dequantize_kernel_t dequantize_kernel> static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) {
return type == GGML_TYPE_Q4_0 ? dequantize_q4_0 :
type == GGML_TYPE_Q4_1 ? dequantize_q4_1 :
type == GGML_TYPE_Q5_0 ? dequantize_q5_0 :
type == GGML_TYPE_Q5_1 ? dequantize_q5_1 :
type == GGML_TYPE_Q8_0 ? dequantize_q8_0 :
type == GGML_TYPE_F16 ? convert_f16 :
nullptr;
}
template <ggml_type type>
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) { static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
// qk = quantized weights per x block constexpr int qk = ggml_cuda_type_traits<type>::qk; // quantized weights per x block
// qr = number of quantized weights per data value in x block constexpr int qr = ggml_cuda_type_traits<type>::qr; // number of quantized weights per data value in x block
constexpr dequantize_kernel_t dequantize_kernel = get_dequantize_kernel(type);
const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y; const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;
if (row >= nrows) { if (row >= nrows) {
@ -493,7 +505,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
const dim3 block_nums(block_num_y, 1, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0> dequantize_mul_mat_vec<GGML_TYPE_Q4_0>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
} }
@ -502,7 +514,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1> dequantize_mul_mat_vec<GGML_TYPE_Q4_1>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
} }
@ -511,7 +523,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0> dequantize_mul_mat_vec<GGML_TYPE_Q5_0>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
} }
@ -520,7 +532,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1> dequantize_mul_mat_vec<GGML_TYPE_Q5_1>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
} }
@ -529,7 +541,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y,
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0> dequantize_mul_mat_vec<GGML_TYPE_Q8_0>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
} }
@ -580,7 +592,7 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<1, 1, convert_f16> dequantize_mul_mat_vec<GGML_TYPE_F16>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
} }
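
With get_dequantize_kernel, the dequantize function is picked at compile time from the type template parameter inside the kernel, so the old <qk, qr, dequantize_kernel_t> parameter list collapses to a single ggml_type argument at every launch site above. A host-side C++ sketch of the same constexpr function-pointer selection idiom (toy types and decoders, assumed for illustration):

#include <cstdio>

enum toy_type { TOY_A, TOY_B };

static void decode_a(int i) { printf("decode_a(%d)\n", i); }
static void decode_b(int i) { printf("decode_b(%d)\n", i); }

typedef void (*decode_fn)(int);

// Chain of ternaries evaluated at compile time; unknown types map to nullptr.
static constexpr decode_fn get_decode_fn(toy_type t) {
    return t == TOY_A ? decode_a :
           t == TOY_B ? decode_b :
           nullptr;
}

template <toy_type type>
void run(int i) {
    constexpr decode_fn f = get_decode_fn(type); // resolved at compile time, no runtime branch
    static_assert(f != nullptr, "unsupported type");
    f(i);
}

int main() {
    run<TOY_A>(1);
    run<TOY_B>(2);
    return 0;
}
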

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -1,9 +1,47 @@
#include "mmvq.cuh" #include "mmvq.cuh"
#include "vecdotq.cuh" #include "vecdotq.cuh"
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda> static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 :
type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :
type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 :
type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 :
type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 :
type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 :
type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 :
type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 :
type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 :
type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 :
type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 :
type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 :
type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 :
type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 :
nullptr;
}
static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
type == GGML_TYPE_IQ4_NL ? VDR_Q4_K_Q8_1_MMVQ :
1;
}
template <ggml_type type, int ncols_y>
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
// tell the compiler to use as many registers as it wants, see nwarps definition below // tell the compiler to use as many registers as it wants, see nwarps definition below
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1) __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
@ -12,6 +50,12 @@ static __global__ void mul_mat_vec_q(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int qi = ggml_cuda_type_traits<type>::qi;
constexpr int vdr = get_vdr_mmvq(type);
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3)) #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
constexpr int nwarps = 1; constexpr int nwarps = 1;
constexpr int rows_per_cuda_block = 1; constexpr int rows_per_cuda_block = 1;
@ -29,7 +73,6 @@ static __global__ void mul_mat_vec_q(
// partial sum for each thread // partial sum for each thread
float tmp[ncols_y][rows_per_cuda_block] = {0.0f}; float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
const block_q_t * x = (const block_q_t *) vx;
const block_q8_1 * y = (const block_q8_1 *) vy; const block_q8_1 * y = (const block_q8_1 *) vy;
for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) { for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
@ -42,8 +85,7 @@ static __global__ void mul_mat_vec_q(
for (int j = 0; j < ncols_y; ++j) { for (int j = 0; j < ncols_y; ++j) {
#pragma unroll #pragma unroll
for (int i = 0; i < rows_per_cuda_block; ++i) { for (int i = 0; i < rows_per_cuda_block; ++i) {
tmp[j][i] += vec_dot_q_cuda( tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
&x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
} }
} }
} }
@ -81,12 +123,12 @@ static __global__ void mul_mat_vec_q(
} }
} }
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot> template <ggml_type type>
static void mul_mat_vec_q_cuda( static void mul_mat_vec_q_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
GGML_ASSERT(ncols_x % qk == 0); GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE); GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
int id = ggml_cuda_get_device(); int id = ggml_cuda_get_device();
@ -124,36 +166,28 @@ static void mul_mat_vec_q_cuda(
switch (ncols_y) { switch (ncols_y) {
case 1: case 1:
mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot> mul_mat_vec_q<type, 1><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
case 2: case 2:
mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot> mul_mat_vec_q<type, 2><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
case 3: case 3:
mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot> mul_mat_vec_q<type, 3><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
case 4: case 4:
mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot> mul_mat_vec_q<type, 4><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
case 5: case 5:
mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot> mul_mat_vec_q<type, 5><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
case 6: case 6:
mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot> mul_mat_vec_q<type, 6><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
case 7: case 7:
mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot> mul_mat_vec_q<type, 7><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
case 8: case 8:
mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot> mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
default: default:
GGML_ASSERT(false); GGML_ASSERT(false);
@ -165,152 +199,133 @@ static void mul_mat_vec_q4_0_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_Q4_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_q4_1_q8_1_cuda( static void mul_mat_vec_q4_1_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_Q4_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_q5_0_q8_1_cuda( static void mul_mat_vec_q5_0_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_Q5_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_q5_1_q8_1_cuda( static void mul_mat_vec_q5_1_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_Q5_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_q8_0_q8_1_cuda( static void mul_mat_vec_q8_0_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_Q8_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_q2_K_q8_1_cuda( static void mul_mat_vec_q2_K_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_Q2_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_q3_K_q8_1_cuda( static void mul_mat_vec_q3_K_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_Q3_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_q4_K_q8_1_cuda( static void mul_mat_vec_q4_K_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_Q4_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_q5_K_q8_1_cuda( static void mul_mat_vec_q5_K_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_Q5_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_q6_K_q8_1_cuda( static void mul_mat_vec_q6_K_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_Q6_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_iq2_xxs_q8_1_cuda( static void mul_mat_vec_iq2_xxs_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_iq2_xs_q8_1_cuda( static void mul_mat_vec_iq2_xs_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_iq2_s_q8_1_cuda( static void mul_mat_vec_iq2_s_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_IQ2_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_iq3_xxs_q8_1_cuda( static void mul_mat_vec_iq3_xxs_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_IQ3_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_iq1_s_q8_1_cuda( static void mul_mat_vec_iq1_s_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_IQ1_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_iq1_m_q8_1_cuda( static void mul_mat_vec_iq1_m_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_m, 1, vec_dot_iq1_m_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_IQ1_M>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_iq4_nl_q8_1_cuda( static void mul_mat_vec_iq4_nl_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_IQ4_NL>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_iq4_xs_q8_1_cuda( static void mul_mat_vec_iq4_xs_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_IQ4_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
static void mul_mat_vec_iq3_s_q8_1_cuda( static void mul_mat_vec_iq3_s_q8_1_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1> mul_mat_vec_q_cuda<GGML_TYPE_IQ3_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
} }
void ggml_cuda_op_mul_mat_vec_q( void ggml_cuda_op_mul_mat_vec_q(
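
Because the typed const block_q_t * x pointer is gone, the vec-dot functions now receive the raw vx pointer together with a flat block index, computed in the kernel as (row0 + i)*blocks_per_row_x + kbx, which is what lets the template drop block_q_t entirely. A tiny C++ illustration of that flat indexing with hypothetical sizes:

#include <cstdio>

int main() {
    const int blocks_per_row_x = 4;   // ncols_x / qk for a hypothetical quant type
    const int row0             = 2;   // first row handled by this CUDA block
    for (int i = 0; i < 2; ++i) {     // rows_per_cuda_block = 2
        for (int kbx = 0; kbx < blocks_per_row_x; ++kbx) {
            // flat block index passed to vec_dot_q_cuda instead of &x[kbx + (row0 + i)*blocks_per_row_x]
            const int flat = (row0 + i)*blocks_per_row_x + kbx;
            printf("row %d, kbx %d -> block %d\n", row0 + i, kbx, flat);
        }
    }
    return 0;
}
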



@ -1,22 +1,23 @@
#include "quantize.cuh" #include "quantize.cuh"
#include <cstdint>
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) { static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (ix >= kx_padded) { if (ix0 >= kx0_padded) {
return; return;
} }
const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y; const int64_t ix1 = blockIdx.y;
const int64_t i_padded = (int64_t)iy*kx_padded + ix; const int64_t i_padded = ix1*kx0_padded + ix0;
block_q8_1 * y = (block_q8_1 *) vy; block_q8_1 * y = (block_q8_1 *) vy;
const int64_t ib = i_padded / QK8_1; // block index const int64_t ib = i_padded / QK8_1; // block index
const int64_t iqs = i_padded % QK8_1; // quant index const int64_t iqs = i_padded % QK8_1; // quant index
const float xi = ix < kx ? x[iy*kx + ix] : 0.0f; const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f;
float amax = fabsf(xi); float amax = fabsf(xi);
float sum = xi; float sum = xi;
@ -36,10 +37,76 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
reinterpret_cast<half&>(y[ib].ds.y) = sum; reinterpret_cast<half&>(y[ib].ds.y) = sum;
} }
void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) { template <bool need_sum>
const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; static __global__ void quantize_mmq_q8_1(
const dim3 num_blocks(block_num_x, ky, 1); const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded); const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (ix0 >= kx0_padded) {
return;
}
const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
const int64_t ib0 = blockIdx.z*(gridDim.y*gridDim.x*blockDim.x/(4*QK8_1)); // first block of channel
const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel
const int64_t iqs = ix0 % (4*QK8_1); // quant index in block
const float xi = ix0 < kx0 ? x[ix1*kx0 + ix0] : 0.0f;
float amax = fabsf(xi);
amax = warp_reduce_max(amax);
float sum;
if (need_sum) {
sum = warp_reduce_sum(xi);
}
const float d = amax / 127;
const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
y[ib].qs[iqs] = q;
if (iqs % QK8_1 != 0) {
return;
}
if (need_sum) {
y[ib].ds[iqs/QK8_1] = make_half2(d, sum);
} else {
((float *) y[ib].ds)[iqs/QK8_1] = d;
}
} }
void quantize_row_q8_1_cuda(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
GGML_ASSERT(kx0_padded % QK8_1 == 0);
const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
const dim3 num_blocks(block_num_x, kx1*channels, 1);
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
GGML_UNUSED(type_x);
}
void quantize_mmq_q8_1_cuda(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
GGML_ASSERT(kx0_padded % (4*QK8_1) == 0);
const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
const dim3 num_blocks(block_num_x, kx1, channels);
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
if (mmq_need_sum(type_x)) {
quantize_mmq_q8_1<true><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
} else {
quantize_mmq_q8_1<false><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
}
}
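
quantize_mmq_q8_1 packs 4*QK8_1 quants plus their four scales into one block_q8_1_mmq, so its index math differs from the plain q8_1 path: the super-block index advances by kx1 every 4*QK8_1 columns, and only lanes whose quant index is a multiple of QK8_1 store a scale. Assuming QK8_1 = 32 as in the CUDA headers, a host-side C++ sketch of that mapping with hypothetical sizes:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t QK8_1 = 32;  // quants per q8_1 block (assumed, as in the CUDA headers)
    const int64_t kx1   = 3;   // hypothetical number of src1 rows in this channel
    const int64_t iy    = 1;   // row inside the channel (plays the role of blockIdx.y)

    const int64_t sample_cols[] = {0, 31, 32, 127, 128, 129};
    for (int64_t ix0 : sample_cols) {
        const int64_t ib  = (ix0 / (4*QK8_1))*kx1 + iy; // super-block index within the channel
        const int64_t iqs = ix0 % (4*QK8_1);            // quant index inside the super-block
        printf("col %3lld -> block %lld, quant %3lld, writes scale: %s\n",
               (long long) ix0, (long long) ib, (long long) iqs, iqs % QK8_1 == 0 ? "yes" : "no");
    }
    return 0;
}
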


@ -1,5 +1,20 @@
#pragma once
#include "common.cuh" #include "common.cuh"
#include "mmq.cuh"
#include <cstdint>
#define CUDA_QUANTIZE_BLOCK_SIZE 256 #define CUDA_QUANTIZE_BLOCK_SIZE 256
void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream); typedef void (*quantize_cuda_t)(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
const ggml_type type_x, cudaStream_t stream);
void quantize_row_q8_1_cuda(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
const ggml_type type_x, cudaStream_t stream);
void quantize_mmq_q8_1_cuda(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
const ggml_type type_x, cudaStream_t stream);


@ -1,7 +1,7 @@
#include "rope.cuh" #include "rope.cuh"
struct rope_corr_dims { struct rope_corr_dims {
float v[4]; float v[2];
}; };
static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) { static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
@ -13,8 +13,7 @@ static __device__ float rope_yarn_ramp(const float low, const float high, const
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
static __device__ void rope_yarn( static __device__ void rope_yarn(
float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
float * cos_theta, float * sin_theta float * cos_theta, float * sin_theta) {
) {
// Get n-d rotational scaling corrected for extrapolation // Get n-d rotational scaling corrected for extrapolation
float theta_interp = freq_scale * theta_extrap; float theta_interp = freq_scale * theta_extrap;
float theta = theta_interp; float theta = theta_interp;
@ -29,27 +28,38 @@ static __device__ void rope_yarn(
*sin_theta = sinf(theta) * mscale; *sin_theta = sinf(theta) * mscale;
} }
// rope == RoPE == rotary positional embedding template<typename T, bool has_ff>
template<typename T, bool has_pos> static __global__ void rope_norm(
static __global__ void rope( const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) {
float ext_factor, float attn_factor, rope_corr_dims corr_dims const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
) {
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (col >= ncols) { if (i0 >= ne0) {
return; return;
} }
const int row = blockDim.x*blockIdx.x + threadIdx.x; const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int i = row*ncols + col;
if (i0 >= n_dims) {
const int i = row*ne0 + i0;
dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1];
return;
}
const int i = row*ne0 + i0;
const int i2 = row/p_delta_rows; const int i2 = row/p_delta_rows;
const int p = has_pos ? pos[i2] : 0; const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
const float theta_base = p*powf(freq_base, -float(col)/ncols);
float cos_theta, sin_theta; const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
float cos_theta;
float sin_theta;
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
const float x0 = x[i + 0]; const float x0 = x[i + 0];
const float x1 = x[i + 1]; const float x1 = x[i + 1];
@ -58,23 +68,20 @@ static __global__ void rope(
dst[i + 1] = x0*sin_theta + x1*cos_theta; dst[i + 1] = x0*sin_theta + x1*cos_theta;
} }
template<typename T, bool has_pos, bool has_freq_facs> template<typename T, bool has_ff>
static __global__ void rope_neox( static __global__ void rope_neox(
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) {
) { const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (col >= ncols) { if (i0 >= ne0) {
return; return;
} }
const int row = blockDim.x*blockIdx.x + threadIdx.x; const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int ib = col / n_dims;
const int ic = col % n_dims;
if (ib > 0) { if (i0 >= n_dims) {
const int i = row*ncols + ib*n_dims + ic; const int i = row*ne0 + i0;
dst[i + 0] = x[i + 0]; dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1]; dst[i + 1] = x[i + 1];
@ -82,16 +89,17 @@ static __global__ void rope_neox(
return; return;
} }
const int i = row*ncols + ib*n_dims + ic/2; const int i = row*ne0 + i0/2;
const int i2 = row/p_delta_rows; const int i2 = row/p_delta_rows;
const int p = has_pos ? pos[i2] : 0; const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
const float theta_base = p*powf(theta_scale, col/2.0f)/freq_factor; const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
float cos_theta, sin_theta; float cos_theta;
rope_yarn(theta_base, freq_scale, corr_dims, ic, ext_factor, attn_factor, &cos_theta, &sin_theta); float sin_theta;
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
const float x0 = x[i + 0]; const float x0 = x[i + 0];
const float x1 = x[i + n_dims/2]; const float x1 = x[i + n_dims/2];
@ -100,144 +108,81 @@ static __global__ void rope_neox(
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
} }
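
Both rewritten kernels now build the base angle the same way: the host precomputes theta_scale = freq_base^(-2/n_dims) and the kernel forms pos * theta_scale^(i0/2), which is algebraically pos * freq_base^(-i0/n_dims); an optional per-frequency factor then divides that angle before rope_yarn applies the YaRN correction. A quick host-side C++ check of that identity with hypothetical values:

#include <cmath>
#include <cstdio>

int main() {
    const float freq_base   = 10000.0f;
    const int   n_dims      = 128;
    const int   pos         = 42;  // hypothetical token position
    const float theta_scale = powf(freq_base, -2.0f/n_dims);

    for (int i0 = 0; i0 < n_dims; i0 += 32) {
        const float a = pos * powf(theta_scale, i0/2.0f);          // form used by rope_norm/rope_neox
        const float b = pos * powf(freq_base, -(float) i0/n_dims); // equivalent closed form
        printf("i0=%3d  %.6f  %.6f\n", i0, a, b);
    }
    return 0;
}
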
static __global__ void rope_glm_f32(
const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
int n_ctx
) {
const int col = blockDim.x*blockIdx.x + threadIdx.x;
const int half_n_dims = ncols/4;
if (col >= half_n_dims) {
return;
}
const int row = blockDim.y*blockIdx.y + threadIdx.y;
const int i = row*ncols + col;
const int i2 = row/p_delta_rows;
const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
// FIXME: this is likely wrong
const int p = pos != nullptr ? pos[i2] : 0;
const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
const float sin_theta = sinf(theta);
const float cos_theta = cosf(theta);
const float x0 = x[i + 0];
const float x1 = x[i + half_n_dims];
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
const float sin_block_theta = sinf(block_theta);
const float cos_block_theta = cosf(block_theta);
const float x2 = x[i + half_n_dims * 2];
const float x3 = x[i + half_n_dims * 3];
dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
}
template<typename T> template<typename T>
static void rope_cuda( static void rope_norm_cuda(
const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
) { GGML_ASSERT(ne0 % 2 == 0);
GGML_ASSERT(ncols % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nrows, num_blocks_x, 1); const dim3 block_nums(nr, n_blocks_x, 1);
if (pos == nullptr) {
rope<T, false><<<block_nums, block_dims, 0, stream>>>( const float theta_scale = powf(freq_base, -2.0f/n_dims);
x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
); if (freq_factors == nullptr) {
rope_norm<T, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors
);
} else { } else {
rope<T, true><<<block_nums, block_dims, 0, stream>>>( rope_norm<T, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
); theta_scale, freq_factors
);
} }
} }
template<typename T> template<typename T>
static void rope_neox_cuda( static void rope_neox_cuda(
const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
) { GGML_ASSERT(ne0 % 2 == 0);
GGML_ASSERT(ncols % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nrows, num_blocks_x, 1); const dim3 block_nums(nr, n_blocks_x, 1);
const float theta_scale = powf(freq_base, -2.0f/n_dims); const float theta_scale = powf(freq_base, -2.0f/n_dims);
if (pos == nullptr) { if (freq_factors == nullptr) {
if (freq_factors == nullptr) { rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
rope_neox<T, false, false><<<block_nums, block_dims, 0, stream>>>( x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors theta_scale, freq_factors
); );
} else {
rope_neox<T, false, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors
);
}
} else { } else {
if (freq_factors == nullptr) { rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
rope_neox<T, true, false><<<block_nums, block_dims, 0, stream>>>( x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors theta_scale, freq_factors
); );
} else {
rope_neox<T, true, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors
);
}
} }
} }
static void rope_glm_f32_cuda( static void rope_norm_cuda_f16(
const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, int n_ctx, cudaStream_t stream float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
) {
GGML_ASSERT(ncols % 4 == 0); rope_norm_cuda<half>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
const dim3 block_nums(num_blocks_x, nrows, 1);
rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
} }
static void rope_cuda_f16( static void rope_norm_cuda_f32(
const half * x, half * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, const float * x, float * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) { float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
rope_cuda<half>(x, dst, ncols, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream); rope_norm_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
}
static void rope_cuda_f32(
const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) {
rope_cuda<float>(x, dst, ncols, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
} }
static void rope_neox_cuda_f16( static void rope_neox_cuda_f16(
const half * x, half * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
rope_neox_cuda<half>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); rope_neox_cuda<half>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
} }
static void rope_neox_cuda_f32( static void rope_neox_cuda_f32(
const float * x, float * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, const float * x, float * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream
) { ) {
rope_neox_cuda<float>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); rope_neox_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
} }
void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -258,16 +203,22 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
-    const int64_t nrows = ggml_nrows(src0);
+    const int64_t nr = ggml_nrows(src0);

     //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode = ((int32_t *) dst->op_params)[2];
-    const int n_ctx = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
+    //const int n_ctx = ((int32_t *) dst->op_params)[3];
+    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];

     // RoPE alteration for extended context
-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;

     memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
     memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
     memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
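
Aside (not part of the patch): a minimal, self-contained sketch of the op_params packing that the reads above rely on — the scalar RoPE hyperparameters travel bit-for-bit inside the int32_t op_params slots and are recovered with memcpy. The concrete values are made up, and the index-8 slot (presumably attn_factor) falls outside the hunks shown here, so treat its meaning as an assumption.

#include <cstdint>
#include <cstring>
#include <cstdio>

int main() {
    int32_t op_params[11] = {0};        // stand-in for dst->op_params
    op_params[1] = 128;                 // n_dims
    op_params[2] = 2;                   // mode; (mode & 2) selects the NeoX path below
    op_params[4] = 4096;                // n_ctx_orig

    const float freq_base = 10000.0f;   // float stored bit-for-bit in an int32_t slot
    memcpy(&op_params[5], &freq_base, sizeof(float));

    // read side, mirroring ggml_cuda_op_rope above
    float freq_base_out;
    memcpy(&freq_base_out, (int32_t *) op_params + 5, sizeof(float));

    const bool is_neox = op_params[2] & 2;
    printf("n_dims=%d is_neox=%d freq_base=%g\n", op_params[1], (int) is_neox, freq_base_out);
    return 0;
}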
@@ -275,38 +226,28 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

-    const float * freq_factors = nullptr;
-    const int32_t * pos = nullptr;
     const bool is_neox = mode & 2;
-    const bool is_glm = mode & 4;

-    pos = (const int32_t *) src1_d;
+    const int32_t * pos = (const int32_t *) src1_d;

-    if (is_neox) {
-        if (src2 != nullptr) {
-            freq_factors = (const float *) src2->data;
-        }
-    } else {
-        GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
+    const float * freq_factors = nullptr;
+    if (src2 != nullptr) {
+        freq_factors = (const float *) src2->data;
     }

     rope_corr_dims corr_dims;
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);

     // compute
-    if (is_glm) {
-        GGML_ASSERT(false);
-        rope_glm_f32_cuda(src0_d, dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, stream);
-    } else if (is_neox) {
+    if (is_neox) {
         if (src0->type == GGML_TYPE_F32) {
             rope_neox_cuda_f32(
-                (const float *)src0_d, (float *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const float *)src0_d, (float *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, freq_factors, stream
             );
         } else if (src0->type == GGML_TYPE_F16) {
             rope_neox_cuda_f16(
-                (const half *)src0_d, (half *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const half *)src0_d, (half *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, freq_factors, stream
             );
         } else {
@@ -314,14 +255,14 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         }
     } else {
         if (src0->type == GGML_TYPE_F32) {
-            rope_cuda_f32(
-                (const float *)src0_d, (float *)dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, stream
+            rope_norm_cuda_f32(
+                (const float *)src0_d, (float *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, stream
             );
         } else if (src0->type == GGML_TYPE_F16) {
-            rope_cuda_f16(
-                (const half *)src0_d, (half *)dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, stream
+            rope_norm_cuda_f16(
+                (const half *)src0_d, (half *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, stream
             );
         } else {
             GGML_ASSERT(false);
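
Aside (not part of the patch): the dispatch above, reduced to a runnable sketch so the net change is easy to see — mode bit 2 picks the NeoX launchers, everything else goes through the renamed rope_norm_* launchers, both paths now receive freq_factors, and the old GLM branch (mode & 4) is gone. The enum here is a local stand-in for ggml_type, introduced only for this illustration.

#include <cstdio>

enum class src_type { f32, f16 };   // stand-in for the two ggml_type cases handled above

static const char * rope_launcher(int mode, src_type type) {
    const bool is_neox = mode & 2;  // same test as in the patch
    if (is_neox) {
        return type == src_type::f32 ? "rope_neox_cuda_f32" : "rope_neox_cuda_f16";
    }
    // non-NeoX ("norm") rotation; also takes freq_factors after this change
    return type == src_type::f32 ? "rope_norm_cuda_f32" : "rope_norm_cuda_f16";
}

int main() {
    printf("%s\n", rope_launcher(/*mode=*/2, src_type::f32));  // rope_neox_cuda_f32
    printf("%s\n", rope_launcher(/*mode=*/0, src_type::f16));  // rope_norm_cuda_f16
    return 0;
}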

View File

@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 #include "../fattn-vec-f16.cuh"

Some files were not shown because too many files have changed in this diff.