Merge branch 'master' into gg/flash-attn

2025-01-28 21:07:06 +01:00 · 2024-03-28 19:32:51 +02:00 · 2024-03-28 19:32:51 +02:00 · 3e318e764f
commit 3e318e764f
parent 57c03b78b6 5106ef482c
21 changed files with 1316 additions and 458 deletions
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@ -24,7 +24,7 @@
    useOpenCL
    useRocm
    useVulkan
-  ],
+  ] && blas.meta.available,
  useCuda ? config.cudaSupport,
  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
  useMpi ? false, # Increases the runtime closure size by ~700M
@ -67,10 +67,15 @@ let
    strings.optionalString (suffices != [ ])
      ", accelerated with ${strings.concatStringsSep ", " suffices}";

+  executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
+
  # TODO: package the Python in this repository in a Nix-like way.
  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
  # https://peps.python.org/pep-0517/
+  #
+  # TODO: Package up each Python script or service appropriately, by making
+  # them into "entrypoints"
  llama-python = python3.withPackages (
    ps: [
      ps.numpy
@ -159,11 +164,6 @@ effectiveStdenv.mkDerivation (
        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
      substituteInPlace ./ggml-metal.m \
        --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
-
-      # TODO: Package up each Python script or service appropriately.
-      # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
-      # we could make those *.py into setuptools' entrypoints
-      substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
    '';

    # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
@ -244,8 +244,8 @@ effectiveStdenv.mkDerivation (
    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
    # if they haven't been added yet.
    postInstall = ''
-      mv $out/bin/main $out/bin/llama
-      mv $out/bin/server $out/bin/llama-server
+      mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
+      mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
      mkdir -p $out/include
      cp $src/llama.h $out/include/
    '';
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@ -0,0 +1,280 @@
+# Benchmark
+name: Benchmark
+
+on:
+  workflow_dispatch:
+    inputs:
+      gpu-series:
+        description: 'Azure GPU series to run with'
+        required: true
+        type: choice
+        options:
+          - Standard_NC4as_T4_v3
+          - Standard_NC24ads_A100_v4
+          - Standard_NC80adis_H100_v5
+      sha:
+        description: 'Commit SHA1 to build'
+        required: false
+        type: string
+      duration:
+        description: 'Duration of the bench'
+        type: string
+        default: 10m
+
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
+  schedule:
+    -  cron: '04 2 * * *'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  bench-server-baseline:
+    runs-on: Standard_NC4as_T4_v3
+    env:
+      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
+      N_USERS: 8
+      DURATION: 10m
+    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Install python env
+        id: pipenv
+        run: |
+          cd examples/server/bench
+          python3 -m venv venv
+          source venv/bin/activate
+          pip install -r requirements.txt
+
+      - name: Prometheus
+        id: install_prometheus
+        run: |
+          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
+          tar xzf prometheus*.tar.gz --strip-components=1
+          ./prometheus --config.file=examples/server/bench/prometheus.yml &
+          while ! nc -z localhost 9090; do
+            sleep 0.1
+          done
+
+      - name: Install k6
+        id: k6_installation
+        run: |
+          cd examples/server/bench
+          wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
+          tar xzf k6*.tar.gz --strip-components=1
+
+      - name: Build
+        id: cmake_build
+        run: |
+          set -eux
+          mkdir build
+          cd build
+          cmake .. \
+              -DLLAMA_NATIVE=OFF \
+              -DLLAMA_BUILD_SERVER=ON \
+              -DLLAMA_CURL=ON \
+              -DLLAMA_CUBLAS=ON \
+              -DCUDAToolkit_ROOT=/usr/local/cuda \
+              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
+              -DCMAKE_CUDA_ARCHITECTURES=75 \
+              -DLLAMA_FATAL_WARNINGS=OFF \
+              -DLLAMA_ALL_WARNINGS=OFF \
+              -DCMAKE_BUILD_TYPE=Release;
+          cmake --build . --config Release -j $(nproc) --target server
+
+      - name: Download the dataset
+        id: download_dataset
+        run: |
+          cd examples/server/bench
+          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+      - name: Server bench
+        id: server_bench
+        run: |
+          set -eux
+
+          cd examples/server/bench
+          source venv/bin/activate
+          BENCH_K6_BIN_PATH=./k6 python bench.py \
+              --runner-label ${{ env.RUNNER_LABEL }} \
+              --name ${{ github.job }} \
+              --branch ${{ github.head_ref || github.ref_name }} \
+              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
+              --scenario script.js \
+              --duration ${{ github.event.inputs.duration || env.DURATION }} \
+              --hf-repo ggml-org/models	 \
+              --hf-file phi-2/ggml-model-q4_0.gguf \
+              --model-path-prefix /models \
+              --parallel ${{ env.N_USERS }} \
+              -ngl 33 \
+              --batch-size 2048 \
+              --ubatch-size	256 \
+              --ctx-size 16384 \
+              --n-prompts 1000 \
+              --max-prompt-tokens 1024 \
+              --max-tokens 2048
+
+          cat results.github.env >> $GITHUB_ENV
+
+          # Remove dataset as we do not want it in the artefact
+          rm ShareGPT_V3_unfiltered_cleaned_split.json
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-results
+          compression-level: 9
+          path: |
+            examples/server/bench/*.jpg
+            examples/server/bench/*.json
+            examples/server/bench/*.log
+
+      - name: Commit status
+        uses: Sibz/github-status-action@v1
+        continue-on-error: true # If not authorized on external repo
+        with:
+          authToken: ${{secrets.GITHUB_TOKEN}}
+          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
+          context: bench-server-baseline
+          description: |
+            ${{ env.BENCH_RESULTS }}
+          state: 'success'
+
+      - name: Upload benchmark images
+        uses: devicons/public-upload-to-imgur@v2.2.2
+        continue-on-error: true # Important as it looks unstable: 503
+        id: imgur_step
+        with:
+          client_id: ${{secrets.IMGUR_CLIENT_ID}}
+          path: |
+            examples/server/bench/prompt_tokens_seconds.jpg
+            examples/server/bench/predicted_tokens_seconds.jpg
+            examples/server/bench/kv_cache_usage_ratio.jpg
+            examples/server/bench/requests_processing.jpg
+
+      - name: Extract mermaid
+        id: set_mermaid
+        run: |
+          set -eux
+
+          cd examples/server/bench
+          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
+          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
+          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
+          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
+          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
+          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
+          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
+          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
+          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Extract image url
+        id: extract_image_url
+        continue-on-error: true
+        run: |
+          set -eux
+
+          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
+          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
+          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
+          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
+
+      - name: Comment PR
+        uses: mshick/add-pr-comment@v2
+        id: comment_pr
+        if: ${{ github.event.pull_request != '' }}
+        with:
+          message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
+          message: |
+            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+
+            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
+            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
+            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
+            - ${{ env.BENCH_GRAPH_XLABEL }}
+
+            <details>
+
+            <summary>Time series</summary>
+
+            <p align="center">
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
+
+            <details>
+
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.PROMPT_TOKENS_SECONDS }}
+            ```
+
+            </details>
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
+
+            <details>
+                <summary>More</summary>
+
+            ```mermaid
+            ${{ env.PREDICTED_TOKENS_SECONDS }}
+            ```
+
+            </details>
+
+            </p>
+
+            <details>
+
+            <summary>Details</summary>
+
+            <p align="center">
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
+
+            <details>
+                <summary>More</summary>
+
+            ```mermaid
+            ${{ env.KV_CACHE_USAGE_RATIO }}
+            ```
+
+            </details>
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
+
+            <details>
+                <summary>More</summary>
+
+            ```mermaid
+            ${{ env.REQUESTS_PROCESSING }}
+            ```
+
+            </details>
+
+            </p>
+            </details>
+            </details>
--- a/1
+++ b/1
@ -556,6 +556,7 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 	OBJS        += ggml-cuda.o
 	OBJS        += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<

--- a/README-sycl.md
+++ b/README-sycl.md
@ -3,7 +3,7 @@
 - [Background](#background)
 - [News](#news)
 - [OS](#os)
- [Intel GPU](#intel-gpu)
+- [Supported Devices](#supported-devices)
 - [Docker](#docker)
 - [Linux](#linux)
 - [Windows](#windows)
@ -14,17 +14,25 @@

 ## Background

-SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
+**SYCL** is a high-level parallel programming model designed to improve developers productivity writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing and based on standard C++17.

-oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
+**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:

-Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
+- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
+- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
+- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
+- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.

-To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
+### Llama.cpp + SYCL
+This SYCL "backend" follows the same design found in other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. The oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.

-The llama.cpp for SYCL is used to support Intel GPUs.
+The llama.cpp SYCL backend supports:
+- Intel GPUs.
+- Nvidia GPUs.

-For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
+*Upcoming support: AMD GPUs*.
+
+When targetting **Intel CPUs**, it is recommended to  use llama.cpp for [x86_64](README.md#intel-onemkl) approach.

 ## News

@ -51,9 +59,16 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
 |Windows|Support|Windows 11|


-## Intel GPU
+## Supported devices

-### Verified
+### Intel GPUs
+
+The oneAPI Math Kernel Library, which the oneAPI base-toolkit includes, supports intel GPUs. In order to make it "visible", simply run the following:
+```sh
+source /opt/intel/oneapi/setvars.sh
+```
+
+- **Tested devices**

 |Intel GPU| Status | Verified Model|
 |-|-|-|
@ -63,198 +78,229 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
 |Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
 |Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|

-Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use.
+*Notes:*

-### Memory
+- Device memory can be a limitation when running a large model on an intel GPU. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.

-The memory is a limitation to run LLM on GPUs.
+- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPUs and 4.0GB for discrete GPUs.

-When run llama.cpp, there is print log to show the applied memory on GPU. You could know how much memory to be used in your case. Like `llm_load_tensors:            buffer size =  3577.56 MiB`.
+- If the iGPU has less than 80  EUs *(Execution Unit)*, the inference speed will likely be too slow for practical use.

-For iGPU, please make sure the shared memory from host memory is enough. For llama-2-7b.Q4_0, recommend the host memory is 8GB+.
+### Nvidia GPUs
+The BLAS acceleration on Nvidia GPUs through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)

-For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.
+- **Tested devices**

-## Nvidia GPU
-
-### Verified
-
-|Intel GPU| Status | Verified Model|
+|Nvidia GPU| Status | Verified Model|
 |-|-|-|
-|Ampere Series| Support| A100|
+|Ampere Series| Support| A100, A4000|
+|Ampere Series *(Mobile)*| Support| RTX 40 Series|

-### oneMKL for CUDA
+*Notes:*
+  - Support for Nvidia targets through oneAPI is currently limited to Linux platforms.

-The current oneMKL release does not contain the oneMKL cuBlas backend.
-As a result for Nvidia GPU's oneMKL must be built from source.
+  - Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs.

-```
-git clone https://github.com/oneapi-src/oneMKL
-cd oneMKL
-mkdir build
-cd build
-cmake -G Ninja .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON
-ninja
-// Add paths as necessary
-```

 ## Docker
-
-Note:
- Only docker on Linux is tested. Docker on WSL may not work.
- You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that)
-
-### Build the image
-
-You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
-
-
+The docker build option is currently limited to *intel GPU* targets.
+### Build image
 ```sh
-# For F16:
-#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
-
-# Or, for F32:
-docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
-
-# Note: you can also use the ".devops/server-intel.Dockerfile", which compiles the "server" example
+# Using FP16
+docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
 ```

-### Run
+*Notes*:
+
+To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
+
+You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
+
+### Run container

 ```sh
-# Firstly, find all the DRI cards:
+# First, find all the DRI cards
 ls -la /dev/dri
-# Then, pick the card that you want to use.
-
-# For example with "/dev/dri/card1"
+# Then, pick the card that you want to use (here for e.g. /dev/dri/card1).
 docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
 ```

+*Notes:*
+- Docker has been tested successfully on native Linux. WSL support has not been verified yet.
+- You may need to install Intel GPU driver on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
+
 ## Linux

-### Setup Environment
+### I. Setup Environment

-1. Install Intel GPU driver.
+1. **Install GPU drivers**

-a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
+  - **Intel GPU**

-Note: for iGPU, please install the client GPU driver.
+Intel data center GPUs drivers installation guide and download page can be found here: [Get intel dGPU Drivers](https://dgpu-docs.intel.com/driver/installation.html#ubuntu-install-steps).

-b. Add user to group: video, render.
+*Note*: for client GPUs *(iGPU & Arc A-Series)*, please refer to the [client iGPU driver installation](https://dgpu-docs.intel.com/driver/client/overview.html).
+
+Once installed, add the user(s) to the `video` and `render` groups.

 ```sh
-sudo usermod -aG render username
-sudo usermod -aG video username
+sudo usermod -aG render $USER
+sudo usermod -aG video $USER
 ```

-Note: re-login to enable it.
+*Note*: logout/re-login for the changes to take effect.

-c. Check
+Verify installation through `clinfo`:

 ```sh
 sudo apt install clinfo
 sudo clinfo -l
 ```

-Output (example):
+Sample output:

-```
+```sh
 Platform #0: Intel(R) OpenCL Graphics
 `-- Device #0: Intel(R) Arc(TM) A770 Graphics

-
 Platform #0: Intel(R) OpenCL HD Graphics
 `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
 ```

-2. Install Intel® oneAPI Base toolkit.
+- **Nvidia GPU**

-a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
+In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed.
+Installation can be verified by running the following:
+```sh
+nvidia-smi
+```
+Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*:
+```
+---------------------------------------------------------------------------------------+
+| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
+|-----------------------------------------+----------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
+|                                         |                      |               MIG M. |
+|=========================================+======================+======================|
+|   0  NVIDIA A100-PCIE-40GB          On  | 00000000:8D:00.0 Off |                    0 |
+| N/A   36C    P0              57W / 250W |      4MiB / 40960MiB |      0%      Default |
+|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
+```

-Recommend to install to default folder: **/opt/intel/oneapi**.

-Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
+2. **Install Intel® oneAPI Base toolkit**

-b. Check
+- **Base installation**
+
+The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
+
+Please follow the instructions for downloading and installing the Toolkit for Linux, and preferably keep the default installation values unchanged, notably the installation path *(`/opt/intel/oneapi` by default)*.
+
+Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
+
+Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
+
+- **Adding support to Nvidia GPUs**
+
+**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
+
+
+**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.

 ```sh
-source /opt/intel/oneapi/setvars.sh
+git clone https://github.com/oneapi-src/oneMKL
+cd oneMKL
+mkdir -p buildWithCublas && cd buildWithCublas
+cmake ../ -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
+make
+```

+
+3. **Verify installation and environment**
+
+In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
+```sh
+source /opt/intel/oneapi/setvars.sh
 sycl-ls
 ```

-There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
+- **Intel GPU**
+
+When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below:

-Output (example):
 ```
 [opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
 [opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
 [opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
 [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
-
 ```

-2. Build locally:
+- **Nvidia GPU**

-Note:
- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
+Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
+```
+[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.12.0.12_195853.xmain-hotfix]
+[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
+[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2]
+```

+### II. Build llama.cpp
+
+#### Intel GPU
 ```sh
-mkdir -p build
-cd build
+# Export relevant ENV variables
 source /opt/intel/oneapi/setvars.sh

-# For FP16:
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+# Build LLAMA with MKL BLAS acceleration for intel GPU
+mkdir -p build && cd build

-# Or, for FP32:
+# Option 1: Use FP16 for better performance in long-prompt  inference
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
+# Option 2: Use FP32 by default
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# For Nvidia GPUs
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# Build example/main only
-#cmake --build . --config Release --target main
-
-# Or, build all binary
-cmake --build . --config Release -v
-
-cd ..
 ```

-or
-
+#### Nvidia GPU
 ```sh
-./examples/sycl/build.sh
+# Export relevant ENV variables
+export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
+export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
+export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
+export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
+
+# Build LLAMA with Nvidia BLAS acceleration through SYCL
+mkdir -p build && cd build
+
+# Option 1: Use FP16 for better performance in long-prompt  inference
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
+# Option 2: Use FP32 by default
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 ```

-### Run
+### III. Run the inference

-1. Put model file to folder **models**
+1. Retrieve and prepare model

-You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
+You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.

 2. Enable oneAPI running environment

-```
+```sh
 source /opt/intel/oneapi/setvars.sh
 ```

-3. List device ID
+3. List devices information

-Run without parameter:
+Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:

 ```sh
 ./build/bin/ls-sycl-device
-
-# or running the "main" executable and look at the output log:
-
-./build/bin/main
 ```
-
-Check the ID in startup log, like:
-
+A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
 ```
 found 6 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
@ -270,15 +316,15 @@ found 6 SYCL devices:

 |Attribute|Note|
 |-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
+|compute capability 1.3|Level-zero driver/runtime, recommended |
+|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases|

-4. Device selection and execution of llama.cpp
+4. Launch inference

 There are two device selection modes:

- Single device: Use one device assigned by user.
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
+- Single device: Use one device target specified by the user.
+- Multiple devices: Automatically select the devices with the same largest Max compute-units.

 |Device selection|Parameter|
 |-|-|
@ -303,74 +349,64 @@ or run by script:
 ```sh
 ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
 ```
-or run by script:
+
+Otherwise, you can run the script:

 ```sh
 ./examples/sycl/run_llama2.sh
 ```

-Note:
+*Notes:*

- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
+- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue.
+- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:

-
-5. Verify the device ID in output
-
-Verify to see if the selected GPU is shown in the output, like:
-
-```
+```sh
 detect 1 SYCL GPUs: [0] with top Max compute units:512
 ```
 Or
-```
+```sh
 use 1 SYCL GPUs: [0] with Max compute units:512
 ```

-
 ## Windows

-### Setup Environment
+### I. Setup Environment

-1. Install Intel GPU driver.
+1. Install GPU driver

-Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
+Intel GPU drivers instructions guide and download page can be found here: [Get intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).

-Note: **The driver is mandatory for compute function**.
+2. Install Visual Studio

-2. Install Visual Studio.
+If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/).

-Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact oneAPI environment enabling in Windows.
+3. Install Intel® oneAPI Base toolkit

-3. Install Intel® oneAPI Base toolkit.
+The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.

-a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
+Please follow the instructions for downloading and installing the Toolkit for Windows, and preferably keep the default installation values unchanged, notably the installation path *(`C:\Program Files (x86)\Intel\oneAPI` by default)*.

-Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**.
-
-Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.
+Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.

 b. Enable oneAPI running environment:

- In Search, input 'oneAPI'.
+- Type "oneAPI" in the search bar, then open the `Intel oneAPI command prompt for Intel 64 for Visual Studio 2022` App.

-Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
-
- In Run:
-
-In CMD:
+- On the command prompt, enable the runtime environment with the following:
 ```
 "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
 ```

-c. Check GPU
+c. Verify installation

-In oneAPI command line:
+In the oneAPI command line, run the following to print the available SYCL devices:

 ```
 sycl-ls
 ```

-There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
+There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device:

 Output (example):
 ```
@ -380,7 +416,7 @@ Output (example):
 [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
 ```

-4. Install cmake & make
+4. Install build tools

 a. Download & install cmake for Windows: https://cmake.org/download/

@ -390,76 +426,53 @@ b. Download & install mingw-w64 make for Windows provided by w64devkit

 - Extract `w64devkit` on your pc.

- Add the **bin** folder path in the Windows system PATH environment, like `C:\xxx\w64devkit\bin\`.
+- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).

-### Build locally:
+### II. Build llama.cpp

-In oneAPI command line window:
+On the oneAPI command line window, step into the llama.cpp main directory and run the following:

 ```
 mkdir -p build
 cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

-::  for FP16
-::  faster for long-prompt inference
-::  cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

-::  for FP32
-cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
-
-
-::  build example/main only
-::  make main
-
-::  build all binary
-make -j
-cd ..
+make
 ```

-or
-
-```
+Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
+```sh
 .\examples\sycl\win-build-sycl.bat
 ```

-Note:
+*Notes:*

- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
+- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.

-### Run
+### III. Run the inference

-1. Put model file to folder **models**
+1. Retrieve and prepare model

-You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
+You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.

 2. Enable oneAPI running environment

- In Search, input 'oneAPI'.
-
-Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
-
- In Run:
-
-In CMD:
+On the oneAPI command line window, run the following and step into the llama.cpp directory:
 ```
 "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
 ```

-3. List device ID
+3. List devices information

-Run without parameter:
+Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:

 ```
 build\bin\ls-sycl-device.exe
-
-or
-
-build\bin\main.exe
 ```

-Check the ID in startup log, like:
-
+The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
 ```
 found 6 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
@ -480,7 +493,7 @@ found 6 SYCL devices:
 |compute capability 3.0|OpenCL running time, slower than level-zero in most cases|


-4. Device selection and execution of llama.cpp
+4. Launch inference

 There are two device selection modes:

@ -505,7 +518,7 @@ build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be
 ```
 build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
 ```
-or run by script:
+Otherwise, run the following wrapper script:

 ```
 .\examples\sycl\win-run-llama2.bat
@ -513,19 +526,14 @@ or run by script:

 Note:

- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
+- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `main.exe` if faced with the issue.
+- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:

-
-
-5. Verify the device ID in output
-
-Verify to see if the selected GPU is shown in the output, like:
-
-```
+```sh
 detect 1 SYCL GPUs: [0] with top Max compute units:512
 ```
 Or
-```
+```sh
 use 1 SYCL GPUs: [0] with Max compute units:512
 ```

@ -535,64 +543,54 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 |Name|Value|Function|
 |-|-|-|
-|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
-|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
-|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
-|CMAKE_CXX_COMPILER|icpx (Linux), icx (Windows)|use icpx/icx for SYCL code path|
-
-#### Running
+|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.|
+|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.|
+|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.|
+|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.|
+|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.|

+#### Runtime

 |Name|Value|Function|
 |-|-|-|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
 |ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|

-## Known Issue
+## Known Issues

- Hang during startup
+- Hanging during startup

-  llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
+  llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang.

-  Solution: add **--no-mmap** or **--mmap 0**.
+  - **Solution**: add `--no-mmap` or `--mmap 0` flag to the `main` executable.

- Split-mode: [row] is not supported
-
-  It's on developing.
+- `Split-mode:[row]` is not supported.

 ## Q&A

-Note: please add prefix **[SYCL]** in issue title, so that we will check it as soon as possible.
-
-
 - Error:  `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.

-  Miss to enable oneAPI running environment.
+  - Potential cause: Unavailable oneAPI installation or not set ENV variables.
+  - Solution: Install *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`.

-  Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
+- General compiler error:

- In Windows, no result, not error.
+  - Remove build folder or try a clean-build.

-  Miss to enable oneAPI running environment.
+- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.

- Meet compile error.
+  Please double-check with `sudo sycl-ls`.

-  Remove folder **build** and try again.
-
- I can **not** see **[ext_oneapi_level_zero:gpu:0]** afer install GPU driver in Linux.
-
-  Please run **sudo sycl-ls**.
-
-  If you see it in result, please add video/render group to your ID:
+  If it's present in the list, please add video/render group to your user then **logout/login** or restart your system:

  ```
-  sudo usermod -aG render username
-  sudo usermod -aG video username
+  sudo usermod -aG render $USER
+  sudo usermod -aG video $USER
  ```
+  Otherwise, please double-check the GPU driver installation steps.

-  Then **relogin**.
-
-  If you do not see it, please check the installation GPU steps again.
+### **GitHub contribution**:
+Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.

 ## Todo

--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -23,7 +23,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

-from convert import HfVocab
+from convert import LlamaHfVocab


 ###### MODEL DEFINITIONS ######
@ -230,7 +230,7 @@ class Model(ABC):
    def _set_vocab_gpt2(self):
        dir_model = self.dir_model
        hparams = self.hparams
-        tokens: list[bytearray] = []
+        tokens: list[str] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
@ -243,8 +243,7 @@ class Model(ABC):

        for i in range(vocab_size):
            if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode('utf-8')
-                tokens.append(bytearray(pad_token))
+                tokens.append(f"[PAD{i}]")
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
@ -266,7 +265,7 @@ class Model(ABC):
    def _set_vocab_qwen(self):
        dir_model = self.dir_model
        hparams = self.hparams
-        tokens: list[bytearray] = []
+        tokens: list[str] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
@ -291,8 +290,7 @@ class Model(ABC):

        for i in range(vocab_size):
            if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode("utf-8")
-                tokens.append(bytearray(pad_token))
+                tokens.append(f"[PAD{i}]")
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
@ -372,12 +370,8 @@ class Model(ABC):
        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)

-    def _set_vocab_hf(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model
-        vocab = HfVocab(
-            path, added_tokens_path if added_tokens_path.exists() else None
-        )
+    def _set_vocab_llama_hf(self):
+        vocab = LlamaHfVocab(self.dir_model)
        tokens = []
        scores = []
        toktypes = []
@ -1099,7 +1093,7 @@ class MiniCPMModel(Model):
        self.gguf_writer.add_file_type(self.ftype)

    def set_vocab(self):
-        self._set_vocab_hf()
+        self._set_vocab_llama_hf()

    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
        if n_kv_head is not None and n_head != n_kv_head:
@ -1700,11 +1694,8 @@ class BertModel(Model):
            self.gguf_writer.add_pooling_type(pooling_type)

    def set_vocab(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model if self.dir_model.exists() else None
-
        # use huggingface vocab to get all tokens
-        vocab = HfVocab(path, added_tokens_path)
+        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
        tokens, scores, toktypes = zip(*vocab.all_tokens())
        assert len(tokens) == vocab.vocab_size
        self.vocab_size = vocab.vocab_size
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@ -106,12 +106,12 @@ def main():
    tensor_map = gguf.get_tensor_name_map(arch, block_count)
    print(tensor_map)
    for name in tensors.keys():
-        data = tensors[name]
+        data_torch = tensors[name]
        if name.endswith(".self_attention.rotary_emb.inv_freq"):
            continue
-        old_dtype = data.dtype
+        old_dtype = data_torch.dtype
        # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
-        data = data.to(torch.float32).squeeze().numpy()
+        data = data_torch.to(torch.float32).squeeze().numpy()
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
--- a/convert.py
+++ b/convert.py
@ -16,13 +16,14 @@ import re
 import signal
 import struct
 import sys
+import textwrap
 import time
 import zipfile
-from abc import ABCMeta, abstractmethod
+from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable

 import numpy as np
 from sentencepiece import SentencePieceProcessor
@ -43,6 +44,9 @@ ARCH = gguf.MODEL_ARCH.LLAMA

 DEFAULT_CONCURRENCY = 8

+ADDED_TOKENS_FILE = 'added_tokens.json'
+FAST_TOKENIZER_FILE = 'tokenizer.json'
+
 #
 # data types
 #
@ -188,8 +192,10 @@ class Params:
            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

        if n_layer < 1:
-            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+            msg = """\
+                failed to guess 'n_layer'. This model is unknown or unsupported.
+                Suggestion: provide 'config.json' of the model in the same directory containing model files."""
+            raise KeyError(textwrap.dedent(msg))

        n_head = n_embd // 128 # guessed
        n_mult = 256           # guessed
@ -211,7 +217,8 @@ class Params:

    @staticmethod
    def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
-        config = json.load(open(config_path))
+        with open(config_path) as f:
+            config = json.load(f)

        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
        rope_scaling = config.get("rope_scaling")
@ -233,8 +240,10 @@ class Params:
        elif "max_position_embeddings" in config:
            n_ctx = config["max_position_embeddings"]
        else:
-            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+            msg = """\
+                failed to guess 'n_ctx'. This model is unknown or unsupported.
+                Suggestion: provide 'config.json' of the model in the same directory containing model files."""
+            raise KeyError(textwrap.dedent(msg))

        n_experts      = None
        n_experts_used = None
@ -265,7 +274,8 @@ class Params:
    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
    @staticmethod
    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
-        config = json.load(open(config_path))
+        with open(config_path) as f:
+            config = json.load(f)

        n_experts      = None
        n_experts_used = None
@ -331,47 +341,86 @@ class Params:
 # vocab
 #

-class BpeVocab:
+@runtime_checkable
+class BaseVocab(Protocol):
+    tokenizer_model: ClassVar[str]
+    name: ClassVar[str]
+
+
+class NoVocab(BaseVocab):
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"
+
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"
+
+
+@runtime_checkable
+class Vocab(BaseVocab, Protocol):
+    vocab_size: int
+    added_tokens_dict: dict[str, int]
+    added_tokens_list: list[str]
+    fname_tokenizer: Path
+
+    def __init__(self, base_path: Path): ...
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
+
+
+class BpeVocab(Vocab):
    tokenizer_model = "gpt2"
    name = "bpe"

-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
-        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
-        if isinstance(self.bpe_tokenizer.get('model'), dict):
-            self.vocab = self.bpe_tokenizer["model"]["vocab"]
-        else:
-            self.vocab = self.bpe_tokenizer
-        added_tokens: dict[str, int]
-        if fname_added_tokens is not None:
-            # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
-            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
-        else:
-            # Fall back to trying to find the added tokens in tokenizer.json
-            tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
-            if not tokenizer_json_file.is_file():
-                added_tokens = {}
-            else:
-                tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
-                added_tokens = dict(
-                    (item['content'], item['id'])
-                    for item in tokenizer_json.get('added_tokens', [])
-                    # Added tokens here can be duplicates of the main vocabulary.
-                    if item['content'] not in self.bpe_tokenizer)
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}

-        vocab_size: int = len(self.vocab)
-        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids      = sorted(added_tokens.values())
+        if (fname_tokenizer := base_path / 'vocab.json').exists():
+            # "slow" tokenizer
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                self.vocab = json.load(f)
+
+            try:
+                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
+                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                pass
+        else:
+            # "fast" tokenizer
+            fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+
+            # if this fails, FileNotFoundError propagates to caller
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+
+            tokenizer_model: dict[str, Any] = tokenizer_json['model']
+            if (
+                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
+                or tokenizer_json['decoder']['type'] != 'ByteLevel'
+            ):
+                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
+
+            self.vocab = tokenizer_model["vocab"]
+
+            if (added := tokenizer_json.get('added_tokens')) is not None:
+                # Added tokens here can be duplicates of the main vocabulary.
+                added_tokens = {item['content']: item['id']
+                                for item in added
+                                if item['content'] not in self.vocab}
+
+        vocab_size   = len(self.vocab)
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids   = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            expected_end_id = vocab_size + len(actual_ids) - 1
-            raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
+            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
+                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")

        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_dict    = added_tokens
        self.added_tokens_list    = [text for (text, idx) in items]
-        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
+        self.vocab_size_base      = vocab_size
+        self.vocab_size           = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer      = fname_tokenizer
-        self.fname_added_tokens   = fname_added_tokens

    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
@ -392,19 +441,25 @@ class BpeVocab:
        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


-class SentencePieceVocab:
+class SentencePieceVocab(Vocab):
    tokenizer_model = "llama"
    name = "spm"

-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
-        added_tokens: dict[str, int]
-        if fname_added_tokens is not None:
-            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
-        else:
-            added_tokens = {}
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}
+        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
+            # normal location
+            try:
+                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                pass
+        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
+            # not found in alternate location either
+            raise FileNotFoundError('Cannot find tokenizer.model')

-        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        vocab_size = self.sentencepiece_tokenizer.vocab_size()

        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
@ -414,18 +469,17 @@ class SentencePieceVocab:
            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

        # Token pieces that were added to the base vocabulary.
-        self.added_tokens_dict = added_tokens
+        self.added_tokens_dict  = added_tokens
        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
        self.vocab_size_base    = vocab_size
        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer    = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens

    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        tokenizer = self.sentencepiece_tokenizer
        for i in range(tokenizer.vocab_size()):
            piece = tokenizer.id_to_piece(i)
-            text: bytes = piece.encode("utf-8")
+            text         = piece.encode("utf-8")
            score: float = tokenizer.get_score(i)

            toktype = gguf.TokenType.NORMAL
@ -458,27 +512,42 @@ class SentencePieceVocab:
        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


-class HfVocab:
+class LlamaHfVocab(Vocab):
    tokenizer_model = "llama"
    name = "hfft"

-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
+    def __init__(self, base_path: Path, ignore_nonllama: bool = False):
+        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+        # if this fails, FileNotFoundError propagates to caller
+        with open(fname_tokenizer, encoding='utf-8') as f:
+            tokenizer_json = json.load(f)
+
+        # pre-check so we know if we need transformers
+        tokenizer_model: dict[str, Any] = tokenizer_json['model']
+        if ignore_nonllama:
+            pass  # workaround incorrect use of this class for WordPiece
+        elif (
+            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
+            or tokenizer_json['decoder']['type'] != 'Sequence'
+        ):
+            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
+
        try:
            from transformers import AutoTokenizer
        except ImportError as e:
            raise ImportError(
-                "To use HfVocab, please install the `transformers` package. "
+                "To use LlamaHfVocab, please install the `transformers` package. "
                "You can install it with `pip install transformers`."
            ) from e

-        print("fname_tokenizer:", fname_tokenizer)
        # Allow the tokenizer to default to slow or fast versions.
        # Explicitly set tokenizer to use local paths.
        self.tokenizer = AutoTokenizer.from_pretrained(
-            fname_tokenizer,
-            cache_dir=fname_tokenizer,
+            base_path,
+            cache_dir=base_path,
            local_files_only=True,
        )
+        assert self.tokenizer.is_fast  # assume tokenizer.json is used

        # Initialize lists and dictionaries for added tokens
        self.added_tokens_list = []
@ -506,8 +575,7 @@ class HfVocab:
        self.vocab_size_base = self.tokenizer.vocab_size
        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)

-        self.fname_tokenizer    = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
+        self.fname_tokenizer = fname_tokenizer

    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {
@ -559,18 +627,7 @@ class HfVocab:
        yield from self.added_tokens()

    def __repr__(self) -> str:
-        return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class NoVocab:
-    tokenizer_model = "no_vocab"
-    name = "no_vocab"
-
-    def __repr__(self) -> str:
-        return "<NoVocab for a model without integrated vocabulary>"
-
-
-Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"
+        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


 #
@ -588,7 +645,7 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
            .reshape(weights.shape))


-class Tensor(metaclass=ABCMeta):
+class Tensor(ABC):
    data_type: DataType

    @abstractmethod
@ -610,7 +667,7 @@ def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:


 class UnquantizedTensor(Tensor):
-    def __init__(self, ndarray: NDArray) -> None:
+    def __init__(self, ndarray: NDArray):
        assert isinstance(ndarray, np.ndarray)
        self.ndarray = ndarray
        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
@ -689,7 +746,7 @@ class ModelPlus:
    model: LazyModel
    paths: list[Path]  # Where this was read from.
    format: Literal['ggml', 'torch', 'safetensors', 'none']
-    vocab: Vocab | None  # For GGML models (which have vocab built in), the vocab.
+    vocab: BaseVocab | None  # For GGML models (which have vocab built in), the vocab.


 def merge_sharded(models: list[LazyModel]) -> LazyModel:
@ -698,7 +755,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
    names = {name: None for model in models for name in model}

    def convert(name: str) -> LazyTensor:
-        lazy_tensors: list[LazyTensor] = [model[name] for model in models]
+        lazy_tensors = [model[name] for model in models]
        if len(lazy_tensors) == 1:
            # only one file; don't go through this procedure since there might
            # be quantized tensors
@ -719,7 +776,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:

        def load() -> UnquantizedTensor:
            ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
-            concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
+            concatenated = np.concatenate(ndarrays, axis=axis)
            return UnquantizedTensor(concatenated)
        description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
        return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
@ -807,10 +864,10 @@ class LazyUnpickler(pickle.Unpickler):

        def load(offset: int, elm_count: int) -> NDArray:
            dtype = data_type.dtype
-            fp = self.zip_file.open(info)
-            fp.seek(offset * dtype.itemsize)
-            size = elm_count * dtype.itemsize
-            data = fp.read(size)
+            with self.zip_file.open(info) as fp:
+                fp.seek(offset * dtype.itemsize)
+                size = elm_count * dtype.itemsize
+                data = fp.read(size)
            assert len(data) == size
            return np.frombuffer(data, dtype)
        description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
@ -831,7 +888,7 @@ class LazyUnpickler(pickle.Unpickler):
    def rebuild_from_type_v2(func, new_type, args, state):
        return func(*args)

-    CLASSES: dict[tuple[str, str], Any] = {
+    CLASSES = {
        # getattr used here as a workaround for mypy not being smart enough to determine
        # the staticmethods have a __func__ attribute.
        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@ -890,7 +947,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
 def must_read(fp: IO[bytes], length: int) -> bytes:
    ret = fp.read(length)
    if len(ret) < length:
-        raise Exception("unexpectedly reached end of file")
+        raise EOFError("unexpectedly reached end of file")
    return ret


@ -948,13 +1005,14 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
            yield result


-def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
+def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None:
    # Handle special case where the model's vocab size is not set
    if params.n_vocab == -1:
        raise ValueError(
-            f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
+            "The model's vocab size is set to -1 in params.json. Please update it manually."
+            + (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""),
        )
-    if isinstance(vocab, NoVocab):
+    if not isinstance(vocab, Vocab):
        return  # model has no vocab

    # Check for a vocab size mismatch
@ -979,11 +1037,11 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
    if vocab.vocab_size < params.n_vocab:
        msg += " Add the --pad-vocab option and try again."

-    raise Exception(msg)
+    raise ValueError(msg)


 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

    def add_meta_arch(self, params: Params) -> None:
@ -1034,8 +1092,6 @@ class OutputFile:
            self.gguf.add_file_type(params.ftype)

    def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
-        assert not isinstance(vocab, NoVocab)
-
        tokens = []
        scores = []
        toktypes = []
@ -1135,7 +1191,7 @@ class OutputFile:

    @staticmethod
    def write_all(
-        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
@ -1145,11 +1201,11 @@ class OutputFile:

        # meta data
        of.add_meta_arch(params)
-        if isinstance(vocab, NoVocab):
-            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
-        else:
+        if isinstance(vocab, Vocab):
            of.add_meta_vocab(vocab)
            of.add_meta_special_vocab(svocab)
+        else:  # NoVocab
+            of.gguf.add_tokenizer_model(vocab.tokenizer_model)

        # tensor info
        for name, lazy_tensor in model.items():
@ -1176,7 +1232,7 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT

    name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}

-    raise Exception(f"Unexpected combination of types: {name_to_type}")
+    raise ValueError(f"Unexpected combination of types: {name_to_type}")


 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
@ -1186,7 +1242,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM

 def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
    tmap = gguf.TensorNameMap(ARCH, params.n_layer)
-    should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
+    should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))

    tmp = model

@ -1213,8 +1269,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
            if skip_unknown:
                print(f"Unexpected tensor name: {name} - skipping")
                continue
-            else:
-                raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
+            raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")

        if tensor_type in should_skip:
            print(f"skipping tensor {name_new}")
@ -1231,7 +1286,7 @@ def nth_multifile_path(path: Path, n: int) -> Path | None:
    the nth path in the model.
    '''
    # Support the following patterns:
-    patterns: list[tuple[str, str]] = [
+    patterns = [
        # - x.00.pth, x.01.pth, etc.
        (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
        # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
@ -1277,9 +1332,9 @@ def load_some_model(path: Path) -> ModelPlus:
            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
            files = [file for glob in globs for file in path.glob(glob)]
        if not files:
-            raise Exception(f"Can't find model in directory {path}")
+            raise FileNotFoundError(f"Can't find model in directory {path}")
        if len(files) > 1:
-            raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
+            raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}")
        path = files[0]

    paths = find_multifile_paths(path)
@ -1293,36 +1348,14 @@ def load_some_model(path: Path) -> ModelPlus:


 class VocabFactory:
-    _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
+    _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]

    def __init__(self, path: Path):
        self.path = path
-        self.file_paths = self._detect_files()
-        print(f"Found vocab files: {self.file_paths}")

-    def _detect_files(self) -> dict[str, Path | None]:
-        def locate(file: str) -> Path | None:
-            if (path := self.path / file).exists():
-                return path
-            if (path := self.path.parent / file).exists():
-                return path
-            return None
-
-        return {vt: locate(f) for vt, f in self._FILES.items()}
-
-    def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
-        for vtype in vocab_types:
-            try:
-                path = self.file_paths[vtype]
-            except KeyError:
-                raise ValueError(f"Unsupported vocabulary type {vtype}") from None
-            if path is not None:
-                return vtype, path
-        raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
-
-    def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
+    def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
        load_merges = vocab.name == "bpe"
-        n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
+        n_vocab = vocab.vocab_size if isinstance(vocab, Vocab) else None
        return gguf.SpecialVocab(
            model_parent_path,
            load_merges=load_merges,
@ -1331,27 +1364,29 @@ class VocabFactory:
        )

    def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
-        vocab_type, path = self._select_file(vocab_types)
-        print(f"Loading vocab file {path!r}, type {vocab_type!r}")
+        vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES}
+        selected_vocabs: dict[str, type[Vocab]] = {}
+        for vtype in vocab_types:
+            try:
+                selected_vocabs[vtype] = vocab_classes[vtype]
+            except KeyError:
+                raise ValueError(f"Unsupported vocabulary type {vtype}") from None

-        added_tokens_path = path.parent / "added_tokens.json"
-        if vocab_type == "bpe":
-            return BpeVocab(
-                path, added_tokens_path if added_tokens_path.exists() else None
-            )
-        if vocab_type == "spm":
-            return SentencePieceVocab(
-                path, added_tokens_path if added_tokens_path.exists() else None
-            )
-        if vocab_type == "hfft":
-            return HfVocab(
-                path.parent, added_tokens_path if added_tokens_path.exists() else None
-            )
-        raise ValueError(vocab_type)
+        for vtype, cls in selected_vocabs.items():
+            try:
+                vocab = cls(self.path)
+                break
+            except FileNotFoundError:
+                pass  # ignore unavailable tokenizers
+        else:
+            raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")

-    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
-        vocab: Vocab
-        if len(vocab_types) == 1 and "no_vocab" in vocab_types:
+        print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
+        return vocab
+
+    def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
+        vocab: BaseVocab
+        if vocab_types is None:
            vocab = NoVocab()
        else:
            vocab = self._create_vocab_by_path(vocab_types)
@ -1408,10 +1443,8 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")

    args = parser.parse_args(args_in)
-    if args.no_vocab:
-        if args.vocab_only:
-            raise ValueError("no need to specify --vocab-only if using --no-vocab")
-        args.vocab_type = "no_vocab"
+    if args.no_vocab and args.vocab_only:
+        raise ValueError("--vocab-only does not make sense with --no-vocab")

    if args.dump_single:
        model_plus = lazy_load_file(args.model)
@ -1433,10 +1466,12 @@ def main(args_in: list[str] | None = None) -> None:
    params = Params.load(model_plus)
    if params.n_ctx == -1:
        if args.ctx is None:
-            raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
-                            "Please specify one with --ctx:\n"
-                            " - LLaMA v1: --ctx 2048\n"
-                            " - LLaMA v2: --ctx 4096\n")
+            msg = """\
+                The model doesn't have a context size, and you didn't specify one with --ctx
+                Please specify one with --ctx:
+                 - LLaMA v1: --ctx 2048
+                 - LLaMA v2: --ctx 4096"""
+            parser.error(textwrap.dedent(msg))
        params.n_ctx = args.ctx

    if args.outtype:
@ -1451,9 +1486,11 @@ def main(args_in: list[str] | None = None) -> None:
    model_parent_path = model_plus.paths[0].parent
    vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
    vocab_factory = VocabFactory(vocab_path)
-    vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path)
+    vocab_types = None if args.no_vocab else args.vocab_type.split(",")
+    vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)

    if args.vocab_only:
+        assert isinstance(vocab, Vocab)
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -178,25 +178,27 @@ int main(int argc, char ** argv) {
    float * out = emb + p * n_embd;
    batch_decode(ctx, batch, out, s, n_embd);

-    // print the first part of the embeddings
+    // print the first part of the embeddings or for a single prompt, the full embedding
    fprintf(stdout, "\n");
    for (int j = 0; j < n_prompts; j++) {
        fprintf(stdout, "embedding %d: ", j);
-        for (int i = 0; i < std::min(16, n_embd); i++) {
+        for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
            fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
        }
        fprintf(stdout, "\n");
    }

    // print cosine similarity matrix
-    fprintf(stdout, "\n");
-    printf("cosine similarity matrix:\n\n");
-    for (int i = 0; i < n_prompts; i++) {
-        for (int j = 0; j < n_prompts; j++) {
-            float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-            fprintf(stdout, "%6.2f ", sim);
-        }
+    if (n_prompts > 1) {
        fprintf(stdout, "\n");
+        printf("cosine similarity matrix:\n\n");
+        for (int i = 0; i < n_prompts; i++) {
+            for (int j = 0; j < n_prompts; j++) {
+                float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                fprintf(stdout, "%6.2f ", sim);
+            }
+            fprintf(stdout, "\n");
+        }
    }

    // clean up
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@ -6,7 +6,7 @@ for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com

 The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.

-Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion  is a little different. Therefore, using MobiVLM as an example, the different conversion step will be shown.
+Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.

 ## Usage
 Build with cmake or run `make llava-cli` to build it.
@ -36,7 +36,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
 ```

-3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** the arg is `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:

 ```sh
 python ./examples/llava/convert-image-encoder-to-gguf \
@ -78,7 +78,7 @@ cd examples/llava/android/build_64
 ### run on Android
 refer to `android/adb_run.sh`, modify resources' `name` and `path`

-## some result on Android with `Snapdragon 888` chip
+## Some result on Android with `Snapdragon 888` chip
 ### case 1
 **input**
 ```sh
@ -109,7 +109,6 @@ llama_print_timings:       total time =   34731.93 ms
    --image /data/local/tmp/cat.jpeg \
    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
 ```
-
 **output**
 ```sh
 encode_image_with_clip: image encoded in 21149.51 ms by CLIP (  146.87 ms per image patch)
@ -121,12 +120,82 @@ llama_print_timings:        eval time =    1279.03 ms /    18 runs   (   71.06 m
 llama_print_timings:       total time =   34570.79 ms
 ```

+
+## Some result on Android with `Snapdragon 778G` chip
+### MobileVLM-1.7B case
+#### llava-cli release-b2005
+**input**
+```sh
+/data/local/tmp/llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    -t 4 \
+    --image /data/local/tmp/many_llamas.jpeg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:"
+```
+**output**
+```sh
+encode_image_with_clip: image encoded in 18728.52 ms by CLIP (  130.06 ms per image patch)
+system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
+user_prompt: \nWhat's that? ASSISTANT:
+
+ A group of llamas are standing in a green pasture.
+
+llama_print_timings:        load time =   20357.33 ms
+llama_print_timings:      sample time =       2.96 ms /    14 runs   (    0.21 ms per token,  4734.53 tokens per second)
+llama_print_timings: prompt eval time =    8119.49 ms /   191 tokens (   42.51 ms per token,    23.52 tokens per second)
+llama_print_timings:        eval time =    1005.75 ms /    14 runs   (   71.84 ms per token,    13.92 tokens per second)
+llama_print_timings:       total time =   28038.34 ms /   205 tokens
+```
+#### llava-cli latest-version
+**input**
+
+Just the same as above.
+
+**output**(seems to be much slower)
+```sh
+encode_image_with_clip: image embedding created: 144 tokens
+
+encode_image_with_clip: image encoded in 288268.88 ms by CLIP ( 2001.87 ms per image patch)
+system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
+user_prompt: \nWhat's that? ASSISTANT:
+
+ It is a group of sheep standing together in a grass field.
+
+llama_print_timings:        load time =  818120.91 ms
+llama_print_timings:      sample time =       3.44 ms /    14 runs   (    0.25 ms per token,  4067.40 tokens per second)
+llama_print_timings: prompt eval time =  529274.69 ms /   191 tokens ( 2771.07 ms per token,     0.36 tokens per second)
+llama_print_timings:        eval time =   43894.02 ms /    13 runs   ( 3376.46 ms per token,     0.30 tokens per second)
+llama_print_timings:       total time =  865441.76 ms /   204 tokens
+```
+### MobileVLM_V2-1.7B case
+#### llava-cli release-2005b
+**input**
+
+Just the same as above.
+
+**output**
+```sh
+encode_image_with_clip: image encoded in 20609.61 ms by CLIP (  143.12 ms per image patch)
+system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
+user_prompt: \nWhat's that? ASSISTANT:
+
+ This image captures a lively scene of 20 llamas in motion on an expansive, grassy field. The llama is scattered across the landscape with some standing and others sitting down as if taking rest or observing their surroundings from different vantage points within this verdant setting.
+
+The background offers glimpses into a picturesque town nestled amidst hills under an overcast sky, adding depth to the scene while also emphasizing that distance between these llama and human-made structures like houses or roads in which they roam freely without any barriers around them. The image is framed by text at both right angles on white backgrounds against a contrasting blue backdrop with green foliage, further drawing attention to the llamas amidst their natural habitat while also inviting viewers into this picturesque landscape within town limits of Alta Llama
+
+llama_print_timings:        load time =   22406.77 ms
+llama_print_timings:      sample time =      49.26 ms /   186 runs   (    0.26 ms per token,  3776.27 tokens per second)
+llama_print_timings: prompt eval time =    9044.54 ms /   191 tokens (   47.35 ms per token,    21.12 tokens per second)
+llama_print_timings:        eval time =   14497.49 ms /   186 runs   (   77.94 ms per token,    12.83 tokens per second)
+llama_print_timings:       total time =   44411.01 ms /   377 tokens
+```
+
 ## Orin compile and run
 ### compile
 ```sh
 make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
 ```
-
 ### run on Orin
 ### case 1
 **input**
@ -175,8 +244,121 @@ llama_print_timings:        eval time =     166.65 ms /    11 runs   (   15.15 m
 llama_print_timings:       total time =    1365.47 ms /   243 tokens
 ```

-## Minor shortcomings
-The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost.
+## Running on Intel(R) Core(TM) i7-10750H
+### Operating system
+Ubuntu22.04
+### compile
+```sh
+make -j32
+```
+### MobileVLM-1.7B case
+**input**
+```sh
+-m /path/to/ggml-model-q4_k.gguf \
+    --mmproj /path/to/mmproj-model-f16.gguf \
+    --image /path/to/many_llamas.jpeg
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:" \
+```
+**output**
+```sh
+encode_image_with_clip: image embedding created: 144 tokens
+
+encode_image_with_clip: image encoded in  2730.94 ms by CLIP (   18.96 ms per image patch)
+system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
+user_prompt: \nWhat's that?ASSISTANT:
+
+ A group of llamas are walking together in a field.
+
+llama_print_timings:        load time =    5506.60 ms
+llama_print_timings:      sample time =       0.44 ms /    13 runs   (    0.03 ms per token, 29545.45 tokens per second)
+llama_print_timings: prompt eval time =    2031.58 ms /   190 tokens (   10.69 ms per token,    93.52 tokens per second)
+llama_print_timings:        eval time =     438.92 ms /    12 runs   (   36.58 ms per token,    27.34 tokens per second)
+llama_print_timings:       total time =    5990.25 ms /   202 tokens
+```
+
+### MobileVLM_V2-1.7B case
+**input**
+
+Just the same as above.
+
+**ouput**
+```sh
+encode_image_with_clip: image embedding created: 144 tokens
+
+encode_image_with_clip: image encoded in  3223.89 ms by CLIP (   22.39 ms per image patch)
+system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
+user_prompt: \nWhat's that?ASSISTANT:
+
+ The image captures a tranquil scene in a park, where a group of approximately 20 llamas are gathered. The llamas, a mix of white and black, are standing in a line, their black and white patterns contrasting with the lush green grass of the park. The lamas are arranged in a line, suggesting a social order.
+
+The park itself is lush and green, with trees dotting the landscape in the background. A sign reading "Llamas Tico  Ana" is also visible in the image, possibly indicating the location or the breed of the llamas. The image seems to be taken from a distance, providing a wide view of the scene and the surrounding environment.
+
+The llamas' positions relative to each other, the sign, and the trees create a harmonious composition. The image does not contain any discernible text. The overall scene is one of peace and natural beauty, with the llamas in their natural habitat, surrounded by the vibrant colors and lush greenery of the park.
+
+llama_print_timings:        load time =    6642.61 ms
+llama_print_timings:      sample time =       8.15 ms /   223 runs   (    0.04 ms per token, 27358.61 tokens per second)
+llama_print_timings: prompt eval time =    2475.07 ms /   190 tokens (   13.03 ms per token,    76.77 tokens per second)
+llama_print_timings:        eval time =    8760.60 ms /   222 runs   (   39.46 ms per token,    25.34 tokens per second)
+llama_print_timings:       total time =   15513.95 ms /   412 tokens
+```
+
+## Run on Intel(R) Core(TM) Ultra7 115H
+### operation system
+Windows11
+### comiple
+```sh
+make -j32
+```
+### MobileVLM-1.7B case
+**input**
+```sh
+-m /path/to/ggml-model-q4_k.gguf \
+    --mmproj /path/to/tmp/mmproj-model-f16.gguf \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:" \
+```
+**output**
+```sh
+encode_image_with_clip: image encoded in  4902.81 ms by CLIP (   34.05 ms per image patch)
+system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
+user_prompt: \nWhat's that? ASSISTANT:
+
+ The image features a group of brown and white llamas standing in a grassy field.
+
+llama_print_timings:        load time =    7441.06 ms
+llama_print_timings:      sample time =       0.72 ms /    19 runs   (    0.04 ms per token, 26279.39 tokens per second)
+llama_print_timings: prompt eval time =    2090.71 ms /   191 tokens (   10.95 ms per token,    91.36 tokens per second)
+llama_print_timings:        eval time =     512.35 ms /    18 runs   (   28.46 ms per token,    35.13 tokens per second)
+llama_print_timings:       total time =    7987.23 ms /   209 tokens
+```
+
+### MobileVLM_V2-1.7B case
+**input**
+
+Just the same as above.
+
+**output**
+```sh
+encode_image_with_clip: image encoded in  4682.44 ms by CLIP (   32.52 ms per image patch)
+system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
+user_prompt: \nWhat's that? ASSISTANT:
+
+ This image captures a lively scene of a group of 14 llamas in a grassy field. The llamas, with their distinctive black and white coats, are standing and walking in a line, seemingly engaged in a social activity. One
+ of them, possibly the first in the line, has its back turned, perhaps observing something in the distance.
+
+The llama in the front of the line stands out due to its black and white coloring, which is quite unusual for llama patterns. The llama in the front also seems to be more aware of its surroundings, as it faces the camera, giving a sense of engagement with the viewer.
+
+The image is taken from the side of the llama, providing a clear view of the llama in the front and its companions. The lameness in the llama in
+ front is not visible, indicating that it might not be the main focus of the photo.
+
+The background of the image features a grassy field, with a fence and a tree visible in the distance. The tree appears to be bare, suggesting that it might be during a time of year when most trees are dormant or have shed their leaves.
+
+
+llama_print_timings:        load time =    7015.35 ms
+llama_print_timings:      sample time =      10.61 ms /   256 runs   (    0.04 ms per token, 24119.09 tokens per second)
+llama_print_timings: prompt eval time =    2052.45 ms /   191 tokens (   10.75 ms per token,    93.06 tokens per second)
+llama_print_timings:        eval time =    7259.43 ms /   255 runs   (   28.47 ms per token,    35.13 tokens per second)
+llama_print_timings:       total time =   14371.19 ms /   446 tokens
+```

 ## TODO

@ -191,5 +373,5 @@ The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quic

 ## contributor
 ```sh
-zhangjidong05, yangyang260, huyiming03, chenxiaotao03
+zhangjidong05, yangyang260, huyiming03, chenxiaotao03, ZiangWu-77
 ```
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -835,9 +835,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
            // weight ne = [3, 3, 2048, 1]
            struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
-            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
            peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
            peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
+            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
            peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
            embeddings = peg_0;
        }
@ -1755,7 +1756,7 @@ int clip_n_patches(const struct clip_ctx * ctx) {

    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);

-    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
        n_patches /= 4;
    }

--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -296,7 +296,9 @@ These options help improve the performance and memory usage of the LLaMA models.

 ### Batch Size

-   `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
+-   `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
+
+- `-ub N`, `--ubatch-size N`: physical maximum batch size. This is for pipeline parallelization. Default: `512`.

 ### Prompt Caching

--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@ -0,0 +1,303 @@
+import argparse
+import json
+import os
+import re
+import signal
+import socket
+import subprocess
+import sys
+import threading
+import time
+import traceback
+from contextlib import closing
+from datetime import datetime
+
+import matplotlib
+import matplotlib.dates
+import matplotlib.pyplot as plt
+import requests
+
+
+def main(args_in: list[str] | None = None) -> None:
+    parser = argparse.ArgumentParser(description="Start server benchmark scenario")
+    parser.add_argument("--name", type=str, help="Bench name", required=True)
+    parser.add_argument("--runner-label", type=str, help="Runner label", required=True)
+    parser.add_argument("--branch", type=str, help="Branch name", default="detached")
+    parser.add_argument("--commit", type=str, help="Commit name", default="dirty")
+    parser.add_argument("--host", type=str, help="Server listen host", default="0.0.0.0")
+    parser.add_argument("--port", type=int, help="Server listen host", default="8080")
+    parser.add_argument("--model-path-prefix", type=str, help="Prefix where to store the model files", default="models")
+    parser.add_argument("--n-prompts", type=int,
+                        help="SERVER_BENCH_N_PROMPTS: total prompts to randomly select in the benchmark", required=True)
+    parser.add_argument("--max-prompt-tokens", type=int,
+                        help="SERVER_BENCH_MAX_PROMPT_TOKENS: maximum prompt tokens to filter out in the dataset",
+                        required=True)
+    parser.add_argument("--max-tokens", type=int,
+                        help="SERVER_BENCH_MAX_CONTEXT: maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens",
+                        required=True)
+    parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
+    parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
+    parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True)
+    parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
+    parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True)
+    parser.add_argument("--batch-size", type=int, help="Set the batch size for prompt processing", required=True)
+    parser.add_argument("--ubatch-size", type=int, help="physical maximum batch size", required=True)
+    parser.add_argument("--scenario", type=str, help="Scenario to run", required=True)
+    parser.add_argument("--duration", type=str, help="Bench scenario", required=True)
+
+    args = parser.parse_args(args_in)
+
+    start_time = time.time()
+
+    # Start the server and performance scenario
+    try:
+        server_process = start_server(args)
+    except Exception:
+        print("bench: server start error :")
+        traceback.print_exc(file=sys.stdout)
+        sys.exit(1)
+
+    # start the benchmark
+    try:
+        start_benchmark(args)
+
+        iterations = 0
+        with open("results.github.env", 'w') as github_env:
+            # parse output
+            with open('k6-results.json', 'r') as bench_results:
+                # Load JSON data from file
+                data = json.load(bench_results)
+                for metric_name in data['metrics']:
+                    for metric_metric in data['metrics'][metric_name]:
+                        value = data['metrics'][metric_name][metric_metric]
+                        if isinstance(value, float) or isinstance(value, int):
+                            value = round(value, 2)
+                            data['metrics'][metric_name][metric_metric]=value
+                            github_env.write(
+                                f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
+                token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
+                iterations = data['root_group']['checks']['success completion']['passes']
+
+    except Exception:
+        print("bench: error :")
+        traceback.print_exc(file=sys.stdout)
+
+    # Stop the server
+    if server_process:
+        try:
+            print(f"bench: shutting down server pid={server_process.pid} ...")
+            if os.name == 'nt':
+                interrupt = signal.CTRL_C_EVENT
+            else:
+                interrupt = signal.SIGINT
+            server_process.send_signal(interrupt)
+            server_process.wait(0.5)
+
+        except subprocess.TimeoutExpired:
+            print(f"server still alive after 500ms, force-killing pid={server_process.pid} ...")
+            server_process.kill()  # SIGKILL
+            server_process.wait()
+
+        while is_server_listening(args.host, args.port):
+            time.sleep(0.1)
+
+    title = (f"llama.cpp {args.name} on {args.runner_label}\n "
+             f"duration={args.duration} {iterations} iterations")
+    xlabel = (f"{args.hf_repo}/{args.hf_file}\n"
+              f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size} pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n"
+              f"branch={args.branch} commit={args.commit}")
+
+    # Prometheus
+    end_time = time.time()
+    if is_server_listening("0.0.0.0", 9090):
+        metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
+                   'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']
+
+        for metric in metrics:
+            resp = requests.get(f"http://localhost:9090/api/v1/query_range",
+                                params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2})
+
+            with open(f"{metric}.json", 'w') as metric_json:
+                metric_json.write(resp.text)
+
+            if resp.status_code != 200:
+                print(f"bench: unable to extract prometheus metric {metric}: {resp.text}")
+            else:
+                metric_data = resp.json()
+                values = metric_data['data']['result'][0]['values']
+                timestamps, metric_values = zip(*values)
+                metric_values = [float(value) for value in metric_values]
+                timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
+                plt.figure(figsize=(16, 10), dpi=80)
+                plt.plot(timestamps_dt, metric_values, label=metric)
+                plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
+                plt.yticks(fontsize=12, alpha=.7)
+
+                ylabel = f"llamacpp:{metric}"
+                plt.title(title,
+                          fontsize=14, wrap=True)
+                plt.grid(axis='both', alpha=.3)
+                plt.ylabel(ylabel, fontsize=22)
+                plt.xlabel(xlabel, fontsize=14, wrap=True)
+                plt.gca().xaxis.set_major_locator(matplotlib.dates.MinuteLocator())
+                plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m-%d %H:%M:%S"))
+                plt.gcf().autofmt_xdate()
+
+                # Remove borders
+                plt.gca().spines["top"].set_alpha(0.0)
+                plt.gca().spines["bottom"].set_alpha(0.3)
+                plt.gca().spines["right"].set_alpha(0.0)
+                plt.gca().spines["left"].set_alpha(0.3)
+
+                # Save the plot as a jpg image
+                plt.savefig(f'{metric}.jpg', dpi=60)
+                plt.close()
+
+                # Mermaid format in case images upload failed
+                with (open(f"{metric}.mermaid", 'w') as mermaid_f):
+                    mermaid = (
+                    f"""---
+config:
+    xyChart:
+        titleFontSize: 12
+        width: 900
+        height: 600
+    themeVariables:
+        xyChart:
+            titleColor: "#000000"
+---
+xychart-beta
+    title "{title}"
+    y-axis "llamacpp:{metric}"
+    x-axis "llamacpp:{metric}" {int(min(timestamps))} --> {int(max(timestamps))}
+    line [{', '.join([str(round(float(value), 2)) for value in metric_values])}]
+                    """)
+                    mermaid_f.write(mermaid)
+
+    # 140 chars max for commit status description
+    bench_results = {
+        "req": {
+            "p90": data['metrics']["http_req_duration"]["p(90)"],
+            "avg": data['metrics']["http_req_duration"]["avg"],
+        },
+        "pp": {
+            "p90": data['metrics']["llamacpp_prompt_tokens"]["p(90)"],
+            "avg": data['metrics']["llamacpp_prompt_tokens"]["avg"],
+        },
+        "tg": {
+            "p90": data['metrics']["llamacpp_tokens_second"]["p(90)"],
+            "avg": data['metrics']["llamacpp_tokens_second"]["avg"],
+        },
+    }
+    with open("results.github.env", 'a') as github_env:
+        github_env.write(f"BENCH_RESULTS={json.dumps(bench_results, indent=None, separators=(',', ':') )}\n")
+        github_env.write(f"BENCH_ITERATIONS={iterations}\n")
+
+        title = title.replace('\n', ' ')
+        xlabel = xlabel.replace('\n', ' ')
+        github_env.write(f"BENCH_GRAPH_TITLE={title}\n")
+        github_env.write(f"BENCH_GRAPH_XLABEL={xlabel}\n")
+
+
+def start_benchmark(args):
+    k6_path = 'k6'
+    if 'BENCH_K6_BIN_PATH' in os.environ:
+        k6_path = os.environ['BENCH_K6_BIN_PATH']
+    k6_args = [
+        'run', args.scenario,
+        '--no-color',
+    ]
+    k6_args.extend(['--duration', args.duration])
+    k6_args.extend(['--iterations', args.n_prompts])
+    k6_args.extend(['--vus', args.parallel])
+    k6_args.extend(['--summary-export', 'k6-results.json'])
+    args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
+    args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
+    print(f"bench: starting k6 with: {args}")
+    k6_completed = subprocess.run(args, shell=True, stdout=sys.stdout, stderr=sys.stderr)
+    if k6_completed.returncode != 0:
+        raise Exception("bench: unable to run k6")
+
+
+def start_server(args):
+    server_process = start_server_background(args)
+
+    attempts = 0
+    max_attempts = 20
+    if 'GITHUB_ACTIONS' in os.environ:
+        max_attempts *= 2
+
+    while not is_server_listening(args.host, args.port):
+        attempts += 1
+        if attempts > max_attempts:
+            assert False, "server not started"
+        print(f"bench:     waiting for server to start ...")
+        time.sleep(0.5)
+
+    print("bench: server started.")
+    return server_process
+
+
+def start_server_background(args):
+    # Start the server
+    server_path = '../../../build/bin/server'
+    if 'LLAMA_SERVER_BIN_PATH' in os.environ:
+        server_path = os.environ['LLAMA_SERVER_BIN_PATH']
+    server_args = [
+        '--host', args.host,
+        '--port', args.port,
+    ]
+    model_file = args.model_path_prefix + os.path.sep + args.hf_file
+    model_dir  = os.path.dirname(model_file)
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir)
+    server_args.extend(['--model', model_file])
+    server_args.extend(['--hf-repo', args.hf_repo])
+    server_args.extend(['--hf-file', args.hf_file])
+    server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
+    server_args.extend(['--ctx-size', args.ctx_size])
+    server_args.extend(['--parallel', args.parallel])
+    server_args.extend(['--batch-size', args.batch_size])
+    server_args.extend(['--ubatch-size', args.ubatch_size])
+    server_args.extend(['--n-predict', args.max_tokens * 2])
+    server_args.extend(['--defrag-thold', "0.1"])
+    server_args.append('--cont-batching')
+    server_args.append('--metrics')
+    server_args.extend(['--log-format', "text"])
+    args = [str(arg) for arg in [server_path, *server_args]]
+    print(f"bench: starting server with: {' '.join(args)}")
+    pkwargs = {
+        'stdout': subprocess.PIPE,
+        'stderr': subprocess.PIPE
+    }
+    server_process = subprocess.Popen(
+        args,
+        **pkwargs)
+
+    def server_log(in_stream, out_stream):
+        for line in iter(in_stream.readline, b''):
+            print(line.decode('utf-8'), end='', file=out_stream)
+
+    thread_stdout = threading.Thread(target=server_log, args=(server_process.stdout, sys.stdout))
+    thread_stdout.start()
+    thread_stderr = threading.Thread(target=server_log, args=(server_process.stderr, sys.stderr))
+    thread_stderr.start()
+
+    return server_process
+
+
+def is_server_listening(server_fqdn, server_port):
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+        result = sock.connect_ex((server_fqdn, server_port))
+        _is_server_listening = result == 0
+        if _is_server_listening:
+            print(f"server is listening on {server_fqdn}:{server_port}...")
+        return _is_server_listening
+
+
+def escape_metric_name(metric_name):
+    return re.sub('[^A-Z0-9]', '_', metric_name.upper())
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/server/bench/prometheus.yml
+++ b/examples/server/bench/prometheus.yml
@ -0,0 +1,9 @@
+global:
+  scrape_interval:     10s
+  external_labels:
+    llamacpp: 'server'
+
+scrape_configs:
+  - job_name: 'llama.cpp server'
+    static_configs:
+      - targets: ['localhost:8080']
--- a/examples/server/bench/requirements.txt
+++ b/examples/server/bench/requirements.txt
@ -0,0 +1,2 @@
+matplotlib
+requests
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -3566,6 +3566,7 @@ int main(int argc, char ** argv) {
    sigemptyset (&sigint_action.sa_mask);
    sigint_action.sa_flags = 0;
    sigaction(SIGINT, &sigint_action, NULL);
+    sigaction(SIGTERM, &sigint_action, NULL);
 #elif defined (_WIN32)
    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@ -1114,7 +1114,10 @@ def start_server_background(context):
        server_args.append('--verbose')
    if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
        server_args.extend(['--log-format', "text"])
-    print(f"starting server with: {context.server_path} {server_args}")
+
+    args = [str(arg) for arg in [context.server_path, *server_args]]
+    print(f"bench: starting server with: {' '.join(args)}")
+
    flags = 0
    if 'nt' == os.name:
        flags |= subprocess.DETACHED_PROCESS
@ -1130,16 +1133,14 @@ def start_server_background(context):
        [str(arg) for arg in [context.server_path, *server_args]],
        **pkwargs)

-    def log_stdout(process):
-        for line in iter(process.stdout.readline, b''):
-            print(line.decode('utf-8'), end='')
-    thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,))
+    def server_log(in_stream, out_stream):
+        for line in iter(in_stream.readline, b''):
+            print(line.decode('utf-8'), end='', file=out_stream)
+
+    thread_stdout = threading.Thread(target=server_log, args=(context.server_process.stdout, sys.stdout))
    thread_stdout.start()

-    def log_stderr(process):
-        for line in iter(process.stderr.readline, b''):
-            print(line.decode('utf-8'), end='', file=sys.stderr)
-    thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,))
+    thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr))
    thread_stderr.start()

    print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
--- a/flake.nix
+++ b/flake.nix
@ -145,6 +145,7 @@
            # the same path you would with an overlay.
            legacyPackages = {
              llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
+              llamaPackagesWindows = pkgs.pkgsCross.mingwW64.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
              llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
              llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
            };
@ -155,6 +156,7 @@
              {
                default = config.legacyPackages.llamaPackages.llama-cpp;
                vulkan = config.packages.default.override { useVulkan = true; };
+                windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
              }
              // lib.optionalAttrs pkgs.stdenv.isLinux {
                opencl = config.packages.default.override { useOpenCL = true; };
@ -168,9 +170,14 @@
              };

            # Packages exposed in `.#checks` will be built by the CI and by
-            # `nix flake check`. Currently we expose all packages, but we could
-            # make more granular choices
-            checks = config.packages;
+            # `nix flake check`.
+            #
+            # We could test all outputs e.g. as `checks = confg.packages`.
+            #
+            # TODO: Build more once https://github.com/ggerganov/llama.cpp/issues/6346 has been addressed
+            checks = {
+              inherit (config.packages) default vulkan;
+            };
          };
      };
 }
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@ -2968,7 +2968,7 @@ namespace dpct
 #include "ggml-common.h"

 static int g_ggml_sycl_debug=0;
-#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
+#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) fprintf(stderr, __VA_ARGS__);}while(0)

 #define CHECK_TRY_ERROR(expr)                                                  \
  [&]() {                                                                      \
@ -12868,6 +12868,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
 }

 void ggml_backend_sycl_print_sycl_devices() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
    int device_count = dpct::dev_mgr::instance().device_count();
    std::map<std::string, size_t> DeviceNums;
    fprintf(stderr, "found %d SYCL devices:\n", device_count);
@ -12925,7 +12926,9 @@ static void ggml_init_sycl() try {
    static bool initialized = false;

    if (!initialized) {
+        fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
+
        fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);

 #if defined(GGML_SYCL_F16)
@ -14986,6 +14989,9 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
    dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];

+    bool no_mixed_dtypes = main_stream->get_backend() == sycl::backend::ext_oneapi_cuda ||
+                           main_stream->get_backend() == sycl::backend::ext_oneapi_hip;
+
    SYCL_CHECK(
        CHECK_TRY_ERROR(g_sycl_handles[g_main_device] = main_stream));

@ -15016,24 +15022,38 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,

    dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
    dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
+    if (no_mixed_dtypes) {
+        cu_compute_type = dpct::library_data_t::real_half;
+        cu_data_type = dpct::library_data_t::real_half;
+    }

    // dst strides
    size_t nbd2 = dst->nb[2];
    size_t nbd3 = dst->nb[3];

+    const float alpha_f32 = 1.0f;
+    const float beta_f32 = 0.0f;
+
    const sycl::half alpha_f16 = 1.0f;
    const sycl::half beta_f16 = 0.0f;

-    const float alpha_f32 = 1.0f;
-    const float beta_f32  = 0.0f;
-
    const void * alpha = &alpha_f32;
    const void * beta  = &beta_f32;
+    if (no_mixed_dtypes) {
+        alpha = &alpha_f16;
+        beta  = &beta_f16;
+    }

    // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
-    // oneMKL open source supports half, half, float, float: datatypes
+    // when oneMKL open source supports half, half, float, float: datatypes

    dst_t = (char *) dst_ddf;
+    if (no_mixed_dtypes) {
+        dst_t = (char *) dst_f16.alloc(ne_dst);
+
+        nbd2 /= sizeof(float) / sizeof(sycl::half);
+        nbd3 /= sizeof(float) / sizeof(sycl::half);
+    }

    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);
@ -15119,6 +15139,10 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
    }
 #endif

+    if (no_mixed_dtypes) {
+        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+        to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
+    }
 }
 catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@ -16018,6 +16042,7 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
 }

 GGML_API GGML_CALL void   ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
    for(int i=0;i<max_len;i++) id_list[i] = -1;

    if (!g_sycl_gpu_mgr) {
@ -16052,6 +16077,7 @@ catch (sycl::exception const &exc) {

 GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
                                      size_t description_size) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
    dpct::device_info prop;
    int device_id = g_sycl_gpu_mgr->gpus[device];
    SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
@ -16066,6 +16092,7 @@ catch (sycl::exception const &exc) {

 GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
                                                   size_t *total) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
    ggml_sycl_set_device(device);

    /*
@ -16417,7 +16444,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
 };

 ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
-    ggml_init_sycl();
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
+
    if (device_index>=g_device_count or device_index<0) {
        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
            device_index, g_device_count-1);
@ -16787,6 +16815,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
 };

 GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
    ggml_init_sycl();
    // FIXME: this is not thread safe
    static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
@ -16859,6 +16888,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
 }

 ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
        /* .iface    = */ {
            /* .get_name         = */ ggml_backend_sycl_host_buffer_type_name,
@ -17155,6 +17185,7 @@ static ggml_guid_t ggml_backend_sycl_guid() {
 }

 GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
    ggml_init_sycl();

    check_allow_gpu_index(device);
@ -17181,6 +17212,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
 }

 GGML_CALL int ggml_backend_sycl_get_device_count() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
    if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
    return g_sycl_gpu_mgr->get_gpu_count();
 }
@ -17193,16 +17225,21 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params,
 }

 GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_index\n");
    return g_sycl_gpu_mgr->get_index(device_id);
 }

 GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n");
    return g_sycl_gpu_mgr->gpus[device_index];
 }

 GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
-    GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+    ggml_init_sycl();
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
    fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+    GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+
    if (g_sycl_gpu_mgr) {
        delete g_sycl_gpu_mgr;
    }
@ -17213,6 +17250,9 @@ GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id
 }

 GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
+    ggml_init_sycl();
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_mul_device_mode\n");
+
    if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
        return;
    }
--- a/ggml.c
+++ b/ggml.c
@ -3000,7 +3000,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
        data_size *= ne[i];
    }

-    GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
+    GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));

    void * data = view_src != NULL ? view_src->data : NULL;
    if (data != NULL) {
--- a/llama.cpp
+++ b/llama.cpp
@ -9192,8 +9192,9 @@ struct llm_build_context {
            if (il == n_layer - 1) {
                // skip computing output for unused tokens
                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
-                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+                cur     = ggml_get_rows(ctx0,     cur, inp_out_ids);
+                inpL    = ggml_get_rows(ctx0,    inpL, inp_out_ids);
+                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
            }

            struct ggml_tensor * attn_out = cur;
--- a/llama.h
+++ b/llama.h
@ -60,9 +60,9 @@ extern "C" {

    enum llama_vocab_type {
        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
+        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
    };

    // note: these values should be synchronized with ggml_rope