Merge branch 'master' into gg/flash-attn

This commit is contained in:
Georgi Gerganov 2024-03-28 19:32:51 +02:00
commit 3e318e764f
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
21 changed files with 1316 additions and 458 deletions

View File

@ -24,7 +24,7 @@
useOpenCL
useRocm
useVulkan
],
] && blas.meta.available,
useCuda ? config.cudaSupport,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
useMpi ? false, # Increases the runtime closure size by ~700M
@ -67,10 +67,15 @@ let
strings.optionalString (suffices != [ ])
", accelerated with ${strings.concatStringsSep ", " suffices}";
executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
# TODO: package the Python in this repository in a Nix-like way.
# It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
# is PEP 517-compatible, and ensure the correct .dist-info is generated.
# https://peps.python.org/pep-0517/
#
# TODO: Package up each Python script or service appropriately, by making
# them into "entrypoints"
llama-python = python3.withPackages (
ps: [
ps.numpy
@ -159,11 +164,6 @@ effectiveStdenv.mkDerivation (
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
substituteInPlace ./ggml-metal.m \
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
# TODO: Package up each Python script or service appropriately.
# If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
# we could make those *.py into setuptools' entrypoints
substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
'';
# With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
@ -244,8 +244,8 @@ effectiveStdenv.mkDerivation (
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
# if they haven't been added yet.
postInstall = ''
mv $out/bin/main $out/bin/llama
mv $out/bin/server $out/bin/llama-server
mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
mkdir -p $out/include
cp $src/llama.h $out/include/
'';

280
.github/workflows/bench.yml vendored Normal file
View File

@ -0,0 +1,280 @@
# Benchmark
name: Benchmark
on:
workflow_dispatch:
inputs:
gpu-series:
description: 'Azure GPU series to run with'
required: true
type: choice
options:
- Standard_NC4as_T4_v3
- Standard_NC24ads_A100_v4
- Standard_NC80adis_H100_v5
sha:
description: 'Commit SHA1 to build'
required: false
type: string
duration:
description: 'Duration of the bench'
type: string
default: 10m
push:
branches:
- master
paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
schedule:
- cron: '04 2 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
bench-server-baseline:
runs-on: Standard_NC4as_T4_v3
env:
RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
N_USERS: 8
DURATION: 10m
if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Install python env
id: pipenv
run: |
cd examples/server/bench
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
- name: Prometheus
id: install_prometheus
run: |
wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
tar xzf prometheus*.tar.gz --strip-components=1
./prometheus --config.file=examples/server/bench/prometheus.yml &
while ! nc -z localhost 9090; do
sleep 0.1
done
- name: Install k6
id: k6_installation
run: |
cd examples/server/bench
wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
tar xzf k6*.tar.gz --strip-components=1
- name: Build
id: cmake_build
run: |
set -eux
mkdir build
cd build
cmake .. \
-DLLAMA_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DLLAMA_CUBLAS=ON \
-DCUDAToolkit_ROOT=/usr/local/cuda \
-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
-DCMAKE_CUDA_ARCHITECTURES=75 \
-DLLAMA_FATAL_WARNINGS=OFF \
-DLLAMA_ALL_WARNINGS=OFF \
-DCMAKE_BUILD_TYPE=Release;
cmake --build . --config Release -j $(nproc) --target server
- name: Download the dataset
id: download_dataset
run: |
cd examples/server/bench
wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
- name: Server bench
id: server_bench
run: |
set -eux
cd examples/server/bench
source venv/bin/activate
BENCH_K6_BIN_PATH=./k6 python bench.py \
--runner-label ${{ env.RUNNER_LABEL }} \
--name ${{ github.job }} \
--branch ${{ github.head_ref || github.ref_name }} \
--commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
--scenario script.js \
--duration ${{ github.event.inputs.duration || env.DURATION }} \
--hf-repo ggml-org/models \
--hf-file phi-2/ggml-model-q4_0.gguf \
--model-path-prefix /models \
--parallel ${{ env.N_USERS }} \
-ngl 33 \
--batch-size 2048 \
--ubatch-size 256 \
--ctx-size 16384 \
--n-prompts 1000 \
--max-prompt-tokens 1024 \
--max-tokens 2048
cat results.github.env >> $GITHUB_ENV
# Remove dataset as we do not want it in the artefact
rm ShareGPT_V3_unfiltered_cleaned_split.json
- uses: actions/upload-artifact@v4
with:
name: benchmark-results
compression-level: 9
path: |
examples/server/bench/*.jpg
examples/server/bench/*.json
examples/server/bench/*.log
- name: Commit status
uses: Sibz/github-status-action@v1
continue-on-error: true # If not authorized on external repo
with:
authToken: ${{secrets.GITHUB_TOKEN}}
sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
context: bench-server-baseline
description: |
${{ env.BENCH_RESULTS }}
state: 'success'
- name: Upload benchmark images
uses: devicons/public-upload-to-imgur@v2.2.2
continue-on-error: true # Important as it looks unstable: 503
id: imgur_step
with:
client_id: ${{secrets.IMGUR_CLIENT_ID}}
path: |
examples/server/bench/prompt_tokens_seconds.jpg
examples/server/bench/predicted_tokens_seconds.jpg
examples/server/bench/kv_cache_usage_ratio.jpg
examples/server/bench/requests_processing.jpg
- name: Extract mermaid
id: set_mermaid
run: |
set -eux
cd examples/server/bench
PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
- name: Extract image url
id: extract_image_url
continue-on-error: true
run: |
set -eux
echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
- name: Comment PR
uses: mshick/add-pr-comment@v2
id: comment_pr
if: ${{ github.event.pull_request != '' }}
with:
message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
message: |
📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
- ${{ env.BENCH_GRAPH_XLABEL }}
<details>
<summary>Time series</summary>
<p align="center">
<img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
<details>
<summary>More</summary>
```mermaid
${{ env.PROMPT_TOKENS_SECONDS }}
```
</details>
<img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
<details>
<summary>More</summary>
```mermaid
${{ env.PREDICTED_TOKENS_SECONDS }}
```
</details>
</p>
<details>
<summary>Details</summary>
<p align="center">
<img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
<details>
<summary>More</summary>
```mermaid
${{ env.KV_CACHE_USAGE_RATIO }}
```
</details>
<img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
<details>
<summary>More</summary>
```mermaid
${{ env.REQUESTS_PROCESSING }}
```
</details>
</p>
</details>
</details>

View File

@ -556,6 +556,7 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<

View File

@ -3,7 +3,7 @@
- [Background](#background)
- [News](#news)
- [OS](#os)
- [Intel GPU](#intel-gpu)
- [Supported Devices](#supported-devices)
- [Docker](#docker)
- [Linux](#linux)
- [Windows](#windows)
@ -14,17 +14,25 @@
## Background
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
**SYCL** is a high-level parallel programming model designed to improve developers productivity writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing and based on standard C++17.
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
### Llama.cpp + SYCL
This SYCL "backend" follows the same design found in other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. The oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
The llama.cpp for SYCL is used to support Intel GPUs.
The llama.cpp SYCL backend supports:
- Intel GPUs.
- Nvidia GPUs.
For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
*Upcoming support: AMD GPUs*.
When targetting **Intel CPUs**, it is recommended to use llama.cpp for [x86_64](README.md#intel-onemkl) approach.
## News
@ -51,9 +59,16 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|Windows|Support|Windows 11|
## Intel GPU
## Supported devices
### Verified
### Intel GPUs
The oneAPI Math Kernel Library, which the oneAPI base-toolkit includes, supports intel GPUs. In order to make it "visible", simply run the following:
```sh
source /opt/intel/oneapi/setvars.sh
```
- **Tested devices**
|Intel GPU| Status | Verified Model|
|-|-|-|
@ -63,198 +78,229 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use.
*Notes:*
### Memory
- Device memory can be a limitation when running a large model on an intel GPU. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
The memory is a limitation to run LLM on GPUs.
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPUs and 4.0GB for discrete GPUs.
When run llama.cpp, there is print log to show the applied memory on GPU. You could know how much memory to be used in your case. Like `llm_load_tensors: buffer size = 3577.56 MiB`.
- If the iGPU has less than 80 EUs *(Execution Unit)*, the inference speed will likely be too slow for practical use.
For iGPU, please make sure the shared memory from host memory is enough. For llama-2-7b.Q4_0, recommend the host memory is 8GB+.
### Nvidia GPUs
The BLAS acceleration on Nvidia GPUs through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)
For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.
- **Tested devices**
## Nvidia GPU
### Verified
|Intel GPU| Status | Verified Model|
|Nvidia GPU| Status | Verified Model|
|-|-|-|
|Ampere Series| Support| A100|
|Ampere Series| Support| A100, A4000|
|Ampere Series *(Mobile)*| Support| RTX 40 Series|
### oneMKL for CUDA
*Notes:*
- Support for Nvidia targets through oneAPI is currently limited to Linux platforms.
The current oneMKL release does not contain the oneMKL cuBlas backend.
As a result for Nvidia GPU's oneMKL must be built from source.
- Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs.
```
git clone https://github.com/oneapi-src/oneMKL
cd oneMKL
mkdir build
cd build
cmake -G Ninja .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON
ninja
// Add paths as necessary
```
## Docker
Note:
- Only docker on Linux is tested. Docker on WSL may not work.
- You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that)
### Build the image
You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
The docker build option is currently limited to *intel GPU* targets.
### Build image
```sh
# For F16:
#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
# Or, for F32:
docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
# Note: you can also use the ".devops/server-intel.Dockerfile", which compiles the "server" example
# Using FP16
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
```
### Run
*Notes*:
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
### Run container
```sh
# Firstly, find all the DRI cards:
# First, find all the DRI cards
ls -la /dev/dri
# Then, pick the card that you want to use.
# For example with "/dev/dri/card1"
# Then, pick the card that you want to use (here for e.g. /dev/dri/card1).
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
*Notes:*
- Docker has been tested successfully on native Linux. WSL support has not been verified yet.
- You may need to install Intel GPU driver on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
## Linux
### Setup Environment
### I. Setup Environment
1. Install Intel GPU driver.
1. **Install GPU drivers**
a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
- **Intel GPU**
Note: for iGPU, please install the client GPU driver.
Intel data center GPUs drivers installation guide and download page can be found here: [Get intel dGPU Drivers](https://dgpu-docs.intel.com/driver/installation.html#ubuntu-install-steps).
b. Add user to group: video, render.
*Note*: for client GPUs *(iGPU & Arc A-Series)*, please refer to the [client iGPU driver installation](https://dgpu-docs.intel.com/driver/client/overview.html).
Once installed, add the user(s) to the `video` and `render` groups.
```sh
sudo usermod -aG render username
sudo usermod -aG video username
sudo usermod -aG render $USER
sudo usermod -aG video $USER
```
Note: re-login to enable it.
*Note*: logout/re-login for the changes to take effect.
c. Check
Verify installation through `clinfo`:
```sh
sudo apt install clinfo
sudo clinfo -l
```
Output (example):
Sample output:
```
```sh
Platform #0: Intel(R) OpenCL Graphics
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
Platform #0: Intel(R) OpenCL HD Graphics
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
```
2. Install Intel® oneAPI Base toolkit.
- **Nvidia GPU**
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed.
Installation can be verified by running the following:
```sh
nvidia-smi
```
Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*:
```
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03 Driver Version: 535.54.03 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA A100-PCIE-40GB On | 00000000:8D:00.0 Off | 0 |
| N/A 36C P0 57W / 250W | 4MiB / 40960MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
```
Recommend to install to default folder: **/opt/intel/oneapi**.
Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
2. **Install Intel® oneAPI Base toolkit**
b. Check
- **Base installation**
The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
Please follow the instructions for downloading and installing the Toolkit for Linux, and preferably keep the default installation values unchanged, notably the installation path *(`/opt/intel/oneapi` by default)*.
Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
- **Adding support to Nvidia GPUs**
**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
```sh
source /opt/intel/oneapi/setvars.sh
git clone https://github.com/oneapi-src/oneMKL
cd oneMKL
mkdir -p buildWithCublas && cd buildWithCublas
cmake ../ -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
make
```
3. **Verify installation and environment**
In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
```sh
source /opt/intel/oneapi/setvars.sh
sycl-ls
```
There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
- **Intel GPU**
When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below:
Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
```
2. Build locally:
- **Nvidia GPU**
Note:
- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2]
```
### II. Build llama.cpp
#### Intel GPU
```sh
mkdir -p build
cd build
# Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh
# For FP16:
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Build LLAMA with MKL BLAS acceleration for intel GPU
mkdir -p build && cd build
# Or, for FP32:
# Option 1: Use FP16 for better performance in long-prompt inference
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Option 2: Use FP32 by default
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# For Nvidia GPUs
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Build example/main only
#cmake --build . --config Release --target main
# Or, build all binary
cmake --build . --config Release -v
cd ..
```
or
#### Nvidia GPU
```sh
./examples/sycl/build.sh
# Export relevant ENV variables
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with Nvidia BLAS acceleration through SYCL
mkdir -p build && cd build
# Option 1: Use FP16 for better performance in long-prompt inference
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Option 2: Use FP32 by default
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
```
### Run
### III. Run the inference
1. Put model file to folder **models**
1. Retrieve and prepare model
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
2. Enable oneAPI running environment
```
```sh
source /opt/intel/oneapi/setvars.sh
```
3. List device ID
3. List devices information
Run without parameter:
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
```sh
./build/bin/ls-sycl-device
# or running the "main" executable and look at the output log:
./build/bin/main
```
Check the ID in startup log, like:
A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
```
found 6 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| |
@ -270,15 +316,15 @@ found 6 SYCL devices:
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|compute capability 1.3|Level-zero driver/runtime, recommended |
|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases|
4. Device selection and execution of llama.cpp
4. Launch inference
There are two device selection modes:
- Single device: Use one device assigned by user.
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
|Device selection|Parameter|
|-|-|
@ -303,74 +349,64 @@ or run by script:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
or run by script:
Otherwise, you can run the script:
```sh
./examples/sycl/run_llama2.sh
```
Note:
*Notes:*
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
5. Verify the device ID in output
Verify to see if the selected GPU is shown in the output, like:
```
```sh
detect 1 SYCL GPUs: [0] with top Max compute units:512
```
Or
```
```sh
use 1 SYCL GPUs: [0] with Max compute units:512
```
## Windows
### Setup Environment
### I. Setup Environment
1. Install Intel GPU driver.
1. Install GPU driver
Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
Intel GPU drivers instructions guide and download page can be found here: [Get intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
Note: **The driver is mandatory for compute function**.
2. Install Visual Studio
2. Install Visual Studio.
If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/).
Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact oneAPI environment enabling in Windows.
3. Install Intel® oneAPI Base toolkit
3. Install Intel® oneAPI Base toolkit.
The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
Please follow the instructions for downloading and installing the Toolkit for Windows, and preferably keep the default installation values unchanged, notably the installation path *(`C:\Program Files (x86)\Intel\oneAPI` by default)*.
Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**.
Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.
Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
b. Enable oneAPI running environment:
- In Search, input 'oneAPI'.
- Type "oneAPI" in the search bar, then open the `Intel oneAPI command prompt for Intel 64 for Visual Studio 2022` App.
Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
- In Run:
In CMD:
- On the command prompt, enable the runtime environment with the following:
```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```
c. Check GPU
c. Verify installation
In oneAPI command line:
In the oneAPI command line, run the following to print the available SYCL devices:
```
sycl-ls
```
There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device:
Output (example):
```
@ -380,7 +416,7 @@ Output (example):
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
```
4. Install cmake & make
4. Install build tools
a. Download & install cmake for Windows: https://cmake.org/download/
@ -390,76 +426,53 @@ b. Download & install mingw-w64 make for Windows provided by w64devkit
- Extract `w64devkit` on your pc.
- Add the **bin** folder path in the Windows system PATH environment, like `C:\xxx\w64devkit\bin\`.
- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).
### Build locally:
### II. Build llama.cpp
In oneAPI command line window:
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
```
mkdir -p build
cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
:: for FP16
:: faster for long-prompt inference
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
:: for FP32
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
:: build example/main only
:: make main
:: build all binary
make -j
cd ..
make
```
or
```
Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
```sh
.\examples\sycl\win-build-sycl.bat
```
Note:
*Notes:*
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.
### Run
### III. Run the inference
1. Put model file to folder **models**
1. Retrieve and prepare model
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
2. Enable oneAPI running environment
- In Search, input 'oneAPI'.
Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
- In Run:
In CMD:
On the oneAPI command line window, run the following and step into the llama.cpp directory:
```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```
3. List device ID
3. List devices information
Run without parameter:
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
```
build\bin\ls-sycl-device.exe
or
build\bin\main.exe
```
Check the ID in startup log, like:
The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
```
found 6 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| |
@ -480,7 +493,7 @@ found 6 SYCL devices:
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
4. Device selection and execution of llama.cpp
4. Launch inference
There are two device selection modes:
@ -505,7 +518,7 @@ build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be
```
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```
or run by script:
Otherwise, run the following wrapper script:
```
.\examples\sycl\win-run-llama2.bat
@ -513,19 +526,14 @@ or run by script:
Note:
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `main.exe` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
5. Verify the device ID in output
Verify to see if the selected GPU is shown in the output, like:
```
```sh
detect 1 SYCL GPUs: [0] with top Max compute units:512
```
Or
```
```sh
use 1 SYCL GPUs: [0] with Max compute units:512
```
@ -535,64 +543,54 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|Name|Value|Function|
|-|-|-|
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|CMAKE_CXX_COMPILER|icpx (Linux), icx (Windows)|use icpx/icx for SYCL code path|
#### Running
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.|
|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.|
|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.|
|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.|
|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.|
#### Runtime
|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|
## Known Issue
## Known Issues
- Hang during startup
- Hanging during startup
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang.
Solution: add **--no-mmap** or **--mmap 0**.
- **Solution**: add `--no-mmap` or `--mmap 0` flag to the `main` executable.
- Split-mode: [row] is not supported
It's on developing.
- `Split-mode:[row]` is not supported.
## Q&A
Note: please add prefix **[SYCL]** in issue title, so that we will check it as soon as possible.
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
Miss to enable oneAPI running environment.
- Potential cause: Unavailable oneAPI installation or not set ENV variables.
- Solution: Install *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`.
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
- General compiler error:
- In Windows, no result, not error.
- Remove build folder or try a clean-build.
Miss to enable oneAPI running environment.
- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.
- Meet compile error.
Please double-check with `sudo sycl-ls`.
Remove folder **build** and try again.
- I can **not** see **[ext_oneapi_level_zero:gpu:0]** afer install GPU driver in Linux.
Please run **sudo sycl-ls**.
If you see it in result, please add video/render group to your ID:
If it's present in the list, please add video/render group to your user then **logout/login** or restart your system:
```
sudo usermod -aG render username
sudo usermod -aG video username
sudo usermod -aG render $USER
sudo usermod -aG video $USER
```
Otherwise, please double-check the GPU driver installation steps.
Then **relogin**.
If you do not see it, please check the installation GPU steps again.
### **GitHub contribution**:
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
## Todo

View File

@ -23,7 +23,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
from convert import HfVocab
from convert import LlamaHfVocab
###### MODEL DEFINITIONS ######
@ -230,7 +230,7 @@ class Model(ABC):
def _set_vocab_gpt2(self):
dir_model = self.dir_model
hparams = self.hparams
tokens: list[bytearray] = []
tokens: list[str] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
@ -243,8 +243,7 @@ class Model(ABC):
for i in range(vocab_size):
if i not in reverse_vocab:
pad_token = f"[PAD{i}]".encode('utf-8')
tokens.append(bytearray(pad_token))
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
@ -266,7 +265,7 @@ class Model(ABC):
def _set_vocab_qwen(self):
dir_model = self.dir_model
hparams = self.hparams
tokens: list[bytearray] = []
tokens: list[str] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
@ -291,8 +290,7 @@ class Model(ABC):
for i in range(vocab_size):
if i not in reverse_vocab:
pad_token = f"[PAD{i}]".encode("utf-8")
tokens.append(bytearray(pad_token))
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
@ -372,12 +370,8 @@ class Model(ABC):
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_hf(self):
path = self.dir_model
added_tokens_path = self.dir_model
vocab = HfVocab(
path, added_tokens_path if added_tokens_path.exists() else None
)
def _set_vocab_llama_hf(self):
vocab = LlamaHfVocab(self.dir_model)
tokens = []
scores = []
toktypes = []
@ -1099,7 +1093,7 @@ class MiniCPMModel(Model):
self.gguf_writer.add_file_type(self.ftype)
def set_vocab(self):
self._set_vocab_hf()
self._set_vocab_llama_hf()
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
if n_kv_head is not None and n_head != n_kv_head:
@ -1700,11 +1694,8 @@ class BertModel(Model):
self.gguf_writer.add_pooling_type(pooling_type)
def set_vocab(self):
path = self.dir_model
added_tokens_path = self.dir_model if self.dir_model.exists() else None
# use huggingface vocab to get all tokens
vocab = HfVocab(path, added_tokens_path)
vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
tokens, scores, toktypes = zip(*vocab.all_tokens())
assert len(tokens) == vocab.vocab_size
self.vocab_size = vocab.vocab_size

View File

@ -106,12 +106,12 @@ def main():
tensor_map = gguf.get_tensor_name_map(arch, block_count)
print(tensor_map)
for name in tensors.keys():
data = tensors[name]
data_torch = tensors[name]
if name.endswith(".self_attention.rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
old_dtype = data_torch.dtype
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
data = data.to(torch.float32).squeeze().numpy()
data = data_torch.to(torch.float32).squeeze().numpy()
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")

View File

@ -16,13 +16,14 @@ import re
import signal
import struct
import sys
import textwrap
import time
import zipfile
from abc import ABCMeta, abstractmethod
from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
import numpy as np
from sentencepiece import SentencePieceProcessor
@ -43,6 +44,9 @@ ARCH = gguf.MODEL_ARCH.LLAMA
DEFAULT_CONCURRENCY = 8
ADDED_TOKENS_FILE = 'added_tokens.json'
FAST_TOKENIZER_FILE = 'tokenizer.json'
#
# data types
#
@ -188,8 +192,10 @@ class Params:
n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
if n_layer < 1:
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
msg = """\
failed to guess 'n_layer'. This model is unknown or unsupported.
Suggestion: provide 'config.json' of the model in the same directory containing model files."""
raise KeyError(textwrap.dedent(msg))
n_head = n_embd // 128 # guessed
n_mult = 256 # guessed
@ -211,7 +217,8 @@ class Params:
@staticmethod
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path))
with open(config_path) as f:
config = json.load(f)
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
rope_scaling = config.get("rope_scaling")
@ -233,8 +240,10 @@ class Params:
elif "max_position_embeddings" in config:
n_ctx = config["max_position_embeddings"]
else:
raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
msg = """\
failed to guess 'n_ctx'. This model is unknown or unsupported.
Suggestion: provide 'config.json' of the model in the same directory containing model files."""
raise KeyError(textwrap.dedent(msg))
n_experts = None
n_experts_used = None
@ -265,7 +274,8 @@ class Params:
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
@staticmethod
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path))
with open(config_path) as f:
config = json.load(f)
n_experts = None
n_experts_used = None
@ -331,47 +341,86 @@ class Params:
# vocab
#
class BpeVocab:
@runtime_checkable
class BaseVocab(Protocol):
tokenizer_model: ClassVar[str]
name: ClassVar[str]
class NoVocab(BaseVocab):
tokenizer_model = "no_vocab"
name = "no_vocab"
def __repr__(self) -> str:
return "<NoVocab for a model without integrated vocabulary>"
@runtime_checkable
class Vocab(BaseVocab, Protocol):
vocab_size: int
added_tokens_dict: dict[str, int]
added_tokens_list: list[str]
fname_tokenizer: Path
def __init__(self, base_path: Path): ...
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
class BpeVocab(Vocab):
tokenizer_model = "gpt2"
name = "bpe"
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
if isinstance(self.bpe_tokenizer.get('model'), dict):
self.vocab = self.bpe_tokenizer["model"]["vocab"]
else:
self.vocab = self.bpe_tokenizer
added_tokens: dict[str, int]
if fname_added_tokens is not None:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
else:
# Fall back to trying to find the added tokens in tokenizer.json
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
if not tokenizer_json_file.is_file():
added_tokens = {}
else:
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
added_tokens = dict(
(item['content'], item['id'])
for item in tokenizer_json.get('added_tokens', [])
# Added tokens here can be duplicates of the main vocabulary.
if item['content'] not in self.bpe_tokenizer)
def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}
vocab_size: int = len(self.vocab)
if (fname_tokenizer := base_path / 'vocab.json').exists():
# "slow" tokenizer
with open(fname_tokenizer, encoding="utf-8") as f:
self.vocab = json.load(f)
try:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
else:
# "fast" tokenizer
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding="utf-8") as f:
tokenizer_json = json.load(f)
tokenizer_model: dict[str, Any] = tokenizer_json['model']
if (
tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'ByteLevel'
):
raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
self.vocab = tokenizer_model["vocab"]
if (added := tokenizer_json.get('added_tokens')) is not None:
# Added tokens here can be duplicates of the main vocabulary.
added_tokens = {item['content']: item['id']
for item in added
if item['content'] not in self.vocab}
vocab_size = len(self.vocab)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
expected_end_id = vocab_size + len(actual_ids) - 1
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
f"{vocab_size} - {expected_end_id}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_dict = added_tokens
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base: int = vocab_size
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
@ -392,19 +441,25 @@ class BpeVocab:
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class SentencePieceVocab:
class SentencePieceVocab(Vocab):
tokenizer_model = "llama"
name = "spm"
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
added_tokens: dict[str, int]
if fname_added_tokens is not None:
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
else:
added_tokens = {}
def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}
if (fname_tokenizer := base_path / 'tokenizer.model').exists():
# normal location
try:
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
# not found in alternate location either
raise FileNotFoundError('Cannot find tokenizer.model')
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
vocab_size = self.sentencepiece_tokenizer.vocab_size()
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
@ -419,13 +474,12 @@ class SentencePieceVocab:
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(tokenizer.vocab_size()):
piece = tokenizer.id_to_piece(i)
text: bytes = piece.encode("utf-8")
text = piece.encode("utf-8")
score: float = tokenizer.get_score(i)
toktype = gguf.TokenType.NORMAL
@ -458,27 +512,42 @@ class SentencePieceVocab:
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class HfVocab:
class LlamaHfVocab(Vocab):
tokenizer_model = "llama"
name = "hfft"
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
def __init__(self, base_path: Path, ignore_nonllama: bool = False):
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding='utf-8') as f:
tokenizer_json = json.load(f)
# pre-check so we know if we need transformers
tokenizer_model: dict[str, Any] = tokenizer_json['model']
if ignore_nonllama:
pass # workaround incorrect use of this class for WordPiece
elif (
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'Sequence'
):
raise FileNotFoundError('Cannot find Llama BPE tokenizer')
try:
from transformers import AutoTokenizer
except ImportError as e:
raise ImportError(
"To use HfVocab, please install the `transformers` package. "
"To use LlamaHfVocab, please install the `transformers` package. "
"You can install it with `pip install transformers`."
) from e
print("fname_tokenizer:", fname_tokenizer)
# Allow the tokenizer to default to slow or fast versions.
# Explicitly set tokenizer to use local paths.
self.tokenizer = AutoTokenizer.from_pretrained(
fname_tokenizer,
cache_dir=fname_tokenizer,
base_path,
cache_dir=base_path,
local_files_only=True,
)
assert self.tokenizer.is_fast # assume tokenizer.json is used
# Initialize lists and dictionaries for added tokens
self.added_tokens_list = []
@ -507,7 +576,6 @@ class HfVocab:
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {
@ -559,18 +627,7 @@ class HfVocab:
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class NoVocab:
tokenizer_model = "no_vocab"
name = "no_vocab"
def __repr__(self) -> str:
return "<NoVocab for a model without integrated vocabulary>"
Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"
return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
#
@ -588,7 +645,7 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
.reshape(weights.shape))
class Tensor(metaclass=ABCMeta):
class Tensor(ABC):
data_type: DataType
@abstractmethod
@ -610,7 +667,7 @@ def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
class UnquantizedTensor(Tensor):
def __init__(self, ndarray: NDArray) -> None:
def __init__(self, ndarray: NDArray):
assert isinstance(ndarray, np.ndarray)
self.ndarray = ndarray
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
@ -689,7 +746,7 @@ class ModelPlus:
model: LazyModel
paths: list[Path] # Where this was read from.
format: Literal['ggml', 'torch', 'safetensors', 'none']
vocab: Vocab | None # For GGML models (which have vocab built in), the vocab.
vocab: BaseVocab | None # For GGML models (which have vocab built in), the vocab.
def merge_sharded(models: list[LazyModel]) -> LazyModel:
@ -698,7 +755,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
names = {name: None for model in models for name in model}
def convert(name: str) -> LazyTensor:
lazy_tensors: list[LazyTensor] = [model[name] for model in models]
lazy_tensors = [model[name] for model in models]
if len(lazy_tensors) == 1:
# only one file; don't go through this procedure since there might
# be quantized tensors
@ -719,7 +776,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
def load() -> UnquantizedTensor:
ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
concatenated = np.concatenate(ndarrays, axis=axis)
return UnquantizedTensor(concatenated)
description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
@ -807,7 +864,7 @@ class LazyUnpickler(pickle.Unpickler):
def load(offset: int, elm_count: int) -> NDArray:
dtype = data_type.dtype
fp = self.zip_file.open(info)
with self.zip_file.open(info) as fp:
fp.seek(offset * dtype.itemsize)
size = elm_count * dtype.itemsize
data = fp.read(size)
@ -831,7 +888,7 @@ class LazyUnpickler(pickle.Unpickler):
def rebuild_from_type_v2(func, new_type, args, state):
return func(*args)
CLASSES: dict[tuple[str, str], Any] = {
CLASSES = {
# getattr used here as a workaround for mypy not being smart enough to determine
# the staticmethods have a __func__ attribute.
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@ -890,7 +947,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
def must_read(fp: IO[bytes], length: int) -> bytes:
ret = fp.read(length)
if len(ret) < length:
raise Exception("unexpectedly reached end of file")
raise EOFError("unexpectedly reached end of file")
return ret
@ -948,13 +1005,14 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
yield result
def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None:
# Handle special case where the model's vocab size is not set
if params.n_vocab == -1:
raise ValueError(
f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
"The model's vocab size is set to -1 in params.json. Please update it manually."
+ (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""),
)
if isinstance(vocab, NoVocab):
if not isinstance(vocab, Vocab):
return # model has no vocab
# Check for a vocab size mismatch
@ -979,11 +1037,11 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
if vocab.vocab_size < params.n_vocab:
msg += " Add the --pad-vocab option and try again."
raise Exception(msg)
raise ValueError(msg)
class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_arch(self, params: Params) -> None:
@ -1034,8 +1092,6 @@ class OutputFile:
self.gguf.add_file_type(params.ftype)
def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
assert not isinstance(vocab, NoVocab)
tokens = []
scores = []
toktypes = []
@ -1135,7 +1191,7 @@ class OutputFile:
@staticmethod
def write_all(
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
) -> None:
@ -1145,11 +1201,11 @@ class OutputFile:
# meta data
of.add_meta_arch(params)
if isinstance(vocab, NoVocab):
of.gguf.add_tokenizer_model(vocab.tokenizer_model)
else:
if isinstance(vocab, Vocab):
of.add_meta_vocab(vocab)
of.add_meta_special_vocab(svocab)
else: # NoVocab
of.gguf.add_tokenizer_model(vocab.tokenizer_model)
# tensor info
for name, lazy_tensor in model.items():
@ -1176,7 +1232,7 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
raise Exception(f"Unexpected combination of types: {name_to_type}")
raise ValueError(f"Unexpected combination of types: {name_to_type}")
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
@ -1186,7 +1242,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM
def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
tmp = model
@ -1213,8 +1269,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
if skip_unknown:
print(f"Unexpected tensor name: {name} - skipping")
continue
else:
raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
if tensor_type in should_skip:
print(f"skipping tensor {name_new}")
@ -1231,7 +1286,7 @@ def nth_multifile_path(path: Path, n: int) -> Path | None:
the nth path in the model.
'''
# Support the following patterns:
patterns: list[tuple[str, str]] = [
patterns = [
# - x.00.pth, x.01.pth, etc.
(r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
# - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
@ -1277,9 +1332,9 @@ def load_some_model(path: Path) -> ModelPlus:
globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
files = [file for glob in globs for file in path.glob(glob)]
if not files:
raise Exception(f"Can't find model in directory {path}")
raise FileNotFoundError(f"Can't find model in directory {path}")
if len(files) > 1:
raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}")
path = files[0]
paths = find_multifile_paths(path)
@ -1293,36 +1348,14 @@ def load_some_model(path: Path) -> ModelPlus:
class VocabFactory:
_FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
_VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]
def __init__(self, path: Path):
self.path = path
self.file_paths = self._detect_files()
print(f"Found vocab files: {self.file_paths}")
def _detect_files(self) -> dict[str, Path | None]:
def locate(file: str) -> Path | None:
if (path := self.path / file).exists():
return path
if (path := self.path.parent / file).exists():
return path
return None
return {vt: locate(f) for vt, f in self._FILES.items()}
def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
for vtype in vocab_types:
try:
path = self.file_paths[vtype]
except KeyError:
raise ValueError(f"Unsupported vocabulary type {vtype}") from None
if path is not None:
return vtype, path
raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
load_merges = vocab.name == "bpe"
n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
n_vocab = vocab.vocab_size if isinstance(vocab, Vocab) else None
return gguf.SpecialVocab(
model_parent_path,
load_merges=load_merges,
@ -1331,27 +1364,29 @@ class VocabFactory:
)
def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
vocab_type, path = self._select_file(vocab_types)
print(f"Loading vocab file {path!r}, type {vocab_type!r}")
vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES}
selected_vocabs: dict[str, type[Vocab]] = {}
for vtype in vocab_types:
try:
selected_vocabs[vtype] = vocab_classes[vtype]
except KeyError:
raise ValueError(f"Unsupported vocabulary type {vtype}") from None
added_tokens_path = path.parent / "added_tokens.json"
if vocab_type == "bpe":
return BpeVocab(
path, added_tokens_path if added_tokens_path.exists() else None
)
if vocab_type == "spm":
return SentencePieceVocab(
path, added_tokens_path if added_tokens_path.exists() else None
)
if vocab_type == "hfft":
return HfVocab(
path.parent, added_tokens_path if added_tokens_path.exists() else None
)
raise ValueError(vocab_type)
for vtype, cls in selected_vocabs.items():
try:
vocab = cls(self.path)
break
except FileNotFoundError:
pass # ignore unavailable tokenizers
else:
raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
vocab: Vocab
if len(vocab_types) == 1 and "no_vocab" in vocab_types:
print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
return vocab
def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
vocab: BaseVocab
if vocab_types is None:
vocab = NoVocab()
else:
vocab = self._create_vocab_by_path(vocab_types)
@ -1408,10 +1443,8 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
args = parser.parse_args(args_in)
if args.no_vocab:
if args.vocab_only:
raise ValueError("no need to specify --vocab-only if using --no-vocab")
args.vocab_type = "no_vocab"
if args.no_vocab and args.vocab_only:
raise ValueError("--vocab-only does not make sense with --no-vocab")
if args.dump_single:
model_plus = lazy_load_file(args.model)
@ -1433,10 +1466,12 @@ def main(args_in: list[str] | None = None) -> None:
params = Params.load(model_plus)
if params.n_ctx == -1:
if args.ctx is None:
raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
"Please specify one with --ctx:\n"
" - LLaMA v1: --ctx 2048\n"
" - LLaMA v2: --ctx 4096\n")
msg = """\
The model doesn't have a context size, and you didn't specify one with --ctx
Please specify one with --ctx:
- LLaMA v1: --ctx 2048
- LLaMA v2: --ctx 4096"""
parser.error(textwrap.dedent(msg))
params.n_ctx = args.ctx
if args.outtype:
@ -1451,9 +1486,11 @@ def main(args_in: list[str] | None = None) -> None:
model_parent_path = model_plus.paths[0].parent
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
vocab_factory = VocabFactory(vocab_path)
vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path)
vocab_types = None if args.no_vocab else args.vocab_type.split(",")
vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
if args.vocab_only:
assert isinstance(vocab, Vocab)
if not args.outfile:
raise ValueError("need --outfile if using --vocab-only")
outfile = args.outfile

View File

@ -178,17 +178,18 @@ int main(int argc, char ** argv) {
float * out = emb + p * n_embd;
batch_decode(ctx, batch, out, s, n_embd);
// print the first part of the embeddings
// print the first part of the embeddings or for a single prompt, the full embedding
fprintf(stdout, "\n");
for (int j = 0; j < n_prompts; j++) {
fprintf(stdout, "embedding %d: ", j);
for (int i = 0; i < std::min(16, n_embd); i++) {
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
}
fprintf(stdout, "\n");
}
// print cosine similarity matrix
if (n_prompts > 1) {
fprintf(stdout, "\n");
printf("cosine similarity matrix:\n\n");
for (int i = 0; i < n_prompts; i++) {
@ -198,6 +199,7 @@ int main(int argc, char ** argv) {
}
fprintf(stdout, "\n");
}
}
// clean up
llama_print_timings(ctx);

View File

@ -6,7 +6,7 @@ for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com
The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using MobiVLM as an example, the different conversion step will be shown.
Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
## Usage
Build with cmake or run `make llava-cli` to build it.
@ -36,7 +36,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
```
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** the arg is `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
```sh
python ./examples/llava/convert-image-encoder-to-gguf \
@ -78,7 +78,7 @@ cd examples/llava/android/build_64
### run on Android
refer to `android/adb_run.sh`, modify resources' `name` and `path`
## some result on Android with `Snapdragon 888` chip
## Some result on Android with `Snapdragon 888` chip
### case 1
**input**
```sh
@ -109,7 +109,6 @@ llama_print_timings: total time = 34731.93 ms
--image /data/local/tmp/cat.jpeg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
```
**output**
```sh
encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch)
@ -121,12 +120,82 @@ llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 m
llama_print_timings: total time = 34570.79 ms
```
## Some result on Android with `Snapdragon 778G` chip
### MobileVLM-1.7B case
#### llava-cli release-b2005
**input**
```sh
/data/local/tmp/llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
--image /data/local/tmp/many_llamas.jpeg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:"
```
**output**
```sh
encode_image_with_clip: image encoded in 18728.52 ms by CLIP ( 130.06 ms per image patch)
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
user_prompt: \nWhat's that? ASSISTANT:
A group of llamas are standing in a green pasture.
llama_print_timings: load time = 20357.33 ms
llama_print_timings: sample time = 2.96 ms / 14 runs ( 0.21 ms per token, 4734.53 tokens per second)
llama_print_timings: prompt eval time = 8119.49 ms / 191 tokens ( 42.51 ms per token, 23.52 tokens per second)
llama_print_timings: eval time = 1005.75 ms / 14 runs ( 71.84 ms per token, 13.92 tokens per second)
llama_print_timings: total time = 28038.34 ms / 205 tokens
```
#### llava-cli latest-version
**input**
Just the same as above.
**output**(seems to be much slower)
```sh
encode_image_with_clip: image embedding created: 144 tokens
encode_image_with_clip: image encoded in 288268.88 ms by CLIP ( 2001.87 ms per image patch)
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
user_prompt: \nWhat's that? ASSISTANT:
It is a group of sheep standing together in a grass field.
llama_print_timings: load time = 818120.91 ms
llama_print_timings: sample time = 3.44 ms / 14 runs ( 0.25 ms per token, 4067.40 tokens per second)
llama_print_timings: prompt eval time = 529274.69 ms / 191 tokens ( 2771.07 ms per token, 0.36 tokens per second)
llama_print_timings: eval time = 43894.02 ms / 13 runs ( 3376.46 ms per token, 0.30 tokens per second)
llama_print_timings: total time = 865441.76 ms / 204 tokens
```
### MobileVLM_V2-1.7B case
#### llava-cli release-2005b
**input**
Just the same as above.
**output**
```sh
encode_image_with_clip: image encoded in 20609.61 ms by CLIP ( 143.12 ms per image patch)
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
user_prompt: \nWhat's that? ASSISTANT:
This image captures a lively scene of 20 llamas in motion on an expansive, grassy field. The llama is scattered across the landscape with some standing and others sitting down as if taking rest or observing their surroundings from different vantage points within this verdant setting.
The background offers glimpses into a picturesque town nestled amidst hills under an overcast sky, adding depth to the scene while also emphasizing that distance between these llama and human-made structures like houses or roads in which they roam freely without any barriers around them. The image is framed by text at both right angles on white backgrounds against a contrasting blue backdrop with green foliage, further drawing attention to the llamas amidst their natural habitat while also inviting viewers into this picturesque landscape within town limits of Alta Llama
llama_print_timings: load time = 22406.77 ms
llama_print_timings: sample time = 49.26 ms / 186 runs ( 0.26 ms per token, 3776.27 tokens per second)
llama_print_timings: prompt eval time = 9044.54 ms / 191 tokens ( 47.35 ms per token, 21.12 tokens per second)
llama_print_timings: eval time = 14497.49 ms / 186 runs ( 77.94 ms per token, 12.83 tokens per second)
llama_print_timings: total time = 44411.01 ms / 377 tokens
```
## Orin compile and run
### compile
```sh
make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
```
### run on Orin
### case 1
**input**
@ -175,8 +244,121 @@ llama_print_timings: eval time = 166.65 ms / 11 runs ( 15.15 m
llama_print_timings: total time = 1365.47 ms / 243 tokens
```
## Minor shortcomings
The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost.
## Running on Intel(R) Core(TM) i7-10750H
### Operating system
Ubuntu22.04
### compile
```sh
make -j32
```
### MobileVLM-1.7B case
**input**
```sh
-m /path/to/ggml-model-q4_k.gguf \
--mmproj /path/to/mmproj-model-f16.gguf \
--image /path/to/many_llamas.jpeg
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:" \
```
**output**
```sh
encode_image_with_clip: image embedding created: 144 tokens
encode_image_with_clip: image encoded in 2730.94 ms by CLIP ( 18.96 ms per image patch)
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
user_prompt: \nWhat's that?ASSISTANT:
A group of llamas are walking together in a field.
llama_print_timings: load time = 5506.60 ms
llama_print_timings: sample time = 0.44 ms / 13 runs ( 0.03 ms per token, 29545.45 tokens per second)
llama_print_timings: prompt eval time = 2031.58 ms / 190 tokens ( 10.69 ms per token, 93.52 tokens per second)
llama_print_timings: eval time = 438.92 ms / 12 runs ( 36.58 ms per token, 27.34 tokens per second)
llama_print_timings: total time = 5990.25 ms / 202 tokens
```
### MobileVLM_V2-1.7B case
**input**
Just the same as above.
**ouput**
```sh
encode_image_with_clip: image embedding created: 144 tokens
encode_image_with_clip: image encoded in 3223.89 ms by CLIP ( 22.39 ms per image patch)
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
user_prompt: \nWhat's that?ASSISTANT:
The image captures a tranquil scene in a park, where a group of approximately 20 llamas are gathered. The llamas, a mix of white and black, are standing in a line, their black and white patterns contrasting with the lush green grass of the park. The lamas are arranged in a line, suggesting a social order.
The park itself is lush and green, with trees dotting the landscape in the background. A sign reading "Llamas Tico Ana" is also visible in the image, possibly indicating the location or the breed of the llamas. The image seems to be taken from a distance, providing a wide view of the scene and the surrounding environment.
The llamas' positions relative to each other, the sign, and the trees create a harmonious composition. The image does not contain any discernible text. The overall scene is one of peace and natural beauty, with the llamas in their natural habitat, surrounded by the vibrant colors and lush greenery of the park.
llama_print_timings: load time = 6642.61 ms
llama_print_timings: sample time = 8.15 ms / 223 runs ( 0.04 ms per token, 27358.61 tokens per second)
llama_print_timings: prompt eval time = 2475.07 ms / 190 tokens ( 13.03 ms per token, 76.77 tokens per second)
llama_print_timings: eval time = 8760.60 ms / 222 runs ( 39.46 ms per token, 25.34 tokens per second)
llama_print_timings: total time = 15513.95 ms / 412 tokens
```
## Run on Intel(R) Core(TM) Ultra7 115H
### operation system
Windows11
### comiple
```sh
make -j32
```
### MobileVLM-1.7B case
**input**
```sh
-m /path/to/ggml-model-q4_k.gguf \
--mmproj /path/to/tmp/mmproj-model-f16.gguf \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:" \
```
**output**
```sh
encode_image_with_clip: image encoded in 4902.81 ms by CLIP ( 34.05 ms per image patch)
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
user_prompt: \nWhat's that? ASSISTANT:
The image features a group of brown and white llamas standing in a grassy field.
llama_print_timings: load time = 7441.06 ms
llama_print_timings: sample time = 0.72 ms / 19 runs ( 0.04 ms per token, 26279.39 tokens per second)
llama_print_timings: prompt eval time = 2090.71 ms / 191 tokens ( 10.95 ms per token, 91.36 tokens per second)
llama_print_timings: eval time = 512.35 ms / 18 runs ( 28.46 ms per token, 35.13 tokens per second)
llama_print_timings: total time = 7987.23 ms / 209 tokens
```
### MobileVLM_V2-1.7B case
**input**
Just the same as above.
**output**
```sh
encode_image_with_clip: image encoded in 4682.44 ms by CLIP ( 32.52 ms per image patch)
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
user_prompt: \nWhat's that? ASSISTANT:
This image captures a lively scene of a group of 14 llamas in a grassy field. The llamas, with their distinctive black and white coats, are standing and walking in a line, seemingly engaged in a social activity. One
of them, possibly the first in the line, has its back turned, perhaps observing something in the distance.
The llama in the front of the line stands out due to its black and white coloring, which is quite unusual for llama patterns. The llama in the front also seems to be more aware of its surroundings, as it faces the camera, giving a sense of engagement with the viewer.
The image is taken from the side of the llama, providing a clear view of the llama in the front and its companions. The lameness in the llama in
front is not visible, indicating that it might not be the main focus of the photo.
The background of the image features a grassy field, with a fence and a tree visible in the distance. The tree appears to be bare, suggesting that it might be during a time of year when most trees are dormant or have shed their leaves.
llama_print_timings: load time = 7015.35 ms
llama_print_timings: sample time = 10.61 ms / 256 runs ( 0.04 ms per token, 24119.09 tokens per second)
llama_print_timings: prompt eval time = 2052.45 ms / 191 tokens ( 10.75 ms per token, 93.06 tokens per second)
llama_print_timings: eval time = 7259.43 ms / 255 runs ( 28.47 ms per token, 35.13 tokens per second)
llama_print_timings: total time = 14371.19 ms / 446 tokens
```
## TODO
@ -191,5 +373,5 @@ The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quic
## contributor
```sh
zhangjidong05, yangyang260, huyiming03, chenxiaotao03
zhangjidong05, yangyang260, huyiming03, chenxiaotao03, ZiangWu-77
```

View File

@ -835,9 +835,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
// weight ne = [3, 3, 2048, 1]
struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
peg_0 = ggml_add(ctx0, peg_0, mlp_2);
peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
peg_0 = ggml_add(ctx0, peg_0, mlp_2);
peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
embeddings = peg_0;
}
@ -1755,7 +1756,7 @@ int clip_n_patches(const struct clip_ctx * ctx) {
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
n_patches /= 4;
}

View File

@ -296,7 +296,9 @@ These options help improve the performance and memory usage of the LLaMA models.
### Batch Size
- `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
- `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
- `-ub N`, `--ubatch-size N`: physical maximum batch size. This is for pipeline parallelization. Default: `512`.
### Prompt Caching

View File

@ -0,0 +1,303 @@
import argparse
import json
import os
import re
import signal
import socket
import subprocess
import sys
import threading
import time
import traceback
from contextlib import closing
from datetime import datetime
import matplotlib
import matplotlib.dates
import matplotlib.pyplot as plt
import requests
def main(args_in: list[str] | None = None) -> None:
parser = argparse.ArgumentParser(description="Start server benchmark scenario")
parser.add_argument("--name", type=str, help="Bench name", required=True)
parser.add_argument("--runner-label", type=str, help="Runner label", required=True)
parser.add_argument("--branch", type=str, help="Branch name", default="detached")
parser.add_argument("--commit", type=str, help="Commit name", default="dirty")
parser.add_argument("--host", type=str, help="Server listen host", default="0.0.0.0")
parser.add_argument("--port", type=int, help="Server listen host", default="8080")
parser.add_argument("--model-path-prefix", type=str, help="Prefix where to store the model files", default="models")
parser.add_argument("--n-prompts", type=int,
help="SERVER_BENCH_N_PROMPTS: total prompts to randomly select in the benchmark", required=True)
parser.add_argument("--max-prompt-tokens", type=int,
help="SERVER_BENCH_MAX_PROMPT_TOKENS: maximum prompt tokens to filter out in the dataset",
required=True)
parser.add_argument("--max-tokens", type=int,
help="SERVER_BENCH_MAX_CONTEXT: maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens",
required=True)
parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True)
parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True)
parser.add_argument("--batch-size", type=int, help="Set the batch size for prompt processing", required=True)
parser.add_argument("--ubatch-size", type=int, help="physical maximum batch size", required=True)
parser.add_argument("--scenario", type=str, help="Scenario to run", required=True)
parser.add_argument("--duration", type=str, help="Bench scenario", required=True)
args = parser.parse_args(args_in)
start_time = time.time()
# Start the server and performance scenario
try:
server_process = start_server(args)
except Exception:
print("bench: server start error :")
traceback.print_exc(file=sys.stdout)
sys.exit(1)
# start the benchmark
try:
start_benchmark(args)
iterations = 0
with open("results.github.env", 'w') as github_env:
# parse output
with open('k6-results.json', 'r') as bench_results:
# Load JSON data from file
data = json.load(bench_results)
for metric_name in data['metrics']:
for metric_metric in data['metrics'][metric_name]:
value = data['metrics'][metric_name][metric_metric]
if isinstance(value, float) or isinstance(value, int):
value = round(value, 2)
data['metrics'][metric_name][metric_metric]=value
github_env.write(
f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
iterations = data['root_group']['checks']['success completion']['passes']
except Exception:
print("bench: error :")
traceback.print_exc(file=sys.stdout)
# Stop the server
if server_process:
try:
print(f"bench: shutting down server pid={server_process.pid} ...")
if os.name == 'nt':
interrupt = signal.CTRL_C_EVENT
else:
interrupt = signal.SIGINT
server_process.send_signal(interrupt)
server_process.wait(0.5)
except subprocess.TimeoutExpired:
print(f"server still alive after 500ms, force-killing pid={server_process.pid} ...")
server_process.kill() # SIGKILL
server_process.wait()
while is_server_listening(args.host, args.port):
time.sleep(0.1)
title = (f"llama.cpp {args.name} on {args.runner_label}\n "
f"duration={args.duration} {iterations} iterations")
xlabel = (f"{args.hf_repo}/{args.hf_file}\n"
f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size} pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n"
f"branch={args.branch} commit={args.commit}")
# Prometheus
end_time = time.time()
if is_server_listening("0.0.0.0", 9090):
metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']
for metric in metrics:
resp = requests.get(f"http://localhost:9090/api/v1/query_range",
params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2})
with open(f"{metric}.json", 'w') as metric_json:
metric_json.write(resp.text)
if resp.status_code != 200:
print(f"bench: unable to extract prometheus metric {metric}: {resp.text}")
else:
metric_data = resp.json()
values = metric_data['data']['result'][0]['values']
timestamps, metric_values = zip(*values)
metric_values = [float(value) for value in metric_values]
timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
plt.figure(figsize=(16, 10), dpi=80)
plt.plot(timestamps_dt, metric_values, label=metric)
plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
plt.yticks(fontsize=12, alpha=.7)
ylabel = f"llamacpp:{metric}"
plt.title(title,
fontsize=14, wrap=True)
plt.grid(axis='both', alpha=.3)
plt.ylabel(ylabel, fontsize=22)
plt.xlabel(xlabel, fontsize=14, wrap=True)
plt.gca().xaxis.set_major_locator(matplotlib.dates.MinuteLocator())
plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m-%d %H:%M:%S"))
plt.gcf().autofmt_xdate()
# Remove borders
plt.gca().spines["top"].set_alpha(0.0)
plt.gca().spines["bottom"].set_alpha(0.3)
plt.gca().spines["right"].set_alpha(0.0)
plt.gca().spines["left"].set_alpha(0.3)
# Save the plot as a jpg image
plt.savefig(f'{metric}.jpg', dpi=60)
plt.close()
# Mermaid format in case images upload failed
with (open(f"{metric}.mermaid", 'w') as mermaid_f):
mermaid = (
f"""---
config:
xyChart:
titleFontSize: 12
width: 900
height: 600
themeVariables:
xyChart:
titleColor: "#000000"
---
xychart-beta
title "{title}"
y-axis "llamacpp:{metric}"
x-axis "llamacpp:{metric}" {int(min(timestamps))} --> {int(max(timestamps))}
line [{', '.join([str(round(float(value), 2)) for value in metric_values])}]
""")
mermaid_f.write(mermaid)
# 140 chars max for commit status description
bench_results = {
"req": {
"p90": data['metrics']["http_req_duration"]["p(90)"],
"avg": data['metrics']["http_req_duration"]["avg"],
},
"pp": {
"p90": data['metrics']["llamacpp_prompt_tokens"]["p(90)"],
"avg": data['metrics']["llamacpp_prompt_tokens"]["avg"],
},
"tg": {
"p90": data['metrics']["llamacpp_tokens_second"]["p(90)"],
"avg": data['metrics']["llamacpp_tokens_second"]["avg"],
},
}
with open("results.github.env", 'a') as github_env:
github_env.write(f"BENCH_RESULTS={json.dumps(bench_results, indent=None, separators=(',', ':') )}\n")
github_env.write(f"BENCH_ITERATIONS={iterations}\n")
title = title.replace('\n', ' ')
xlabel = xlabel.replace('\n', ' ')
github_env.write(f"BENCH_GRAPH_TITLE={title}\n")
github_env.write(f"BENCH_GRAPH_XLABEL={xlabel}\n")
def start_benchmark(args):
k6_path = 'k6'
if 'BENCH_K6_BIN_PATH' in os.environ:
k6_path = os.environ['BENCH_K6_BIN_PATH']
k6_args = [
'run', args.scenario,
'--no-color',
]
k6_args.extend(['--duration', args.duration])
k6_args.extend(['--iterations', args.n_prompts])
k6_args.extend(['--vus', args.parallel])
k6_args.extend(['--summary-export', 'k6-results.json'])
args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
print(f"bench: starting k6 with: {args}")
k6_completed = subprocess.run(args, shell=True, stdout=sys.stdout, stderr=sys.stderr)
if k6_completed.returncode != 0:
raise Exception("bench: unable to run k6")
def start_server(args):
server_process = start_server_background(args)
attempts = 0
max_attempts = 20
if 'GITHUB_ACTIONS' in os.environ:
max_attempts *= 2
while not is_server_listening(args.host, args.port):
attempts += 1
if attempts > max_attempts:
assert False, "server not started"
print(f"bench: waiting for server to start ...")
time.sleep(0.5)
print("bench: server started.")
return server_process
def start_server_background(args):
# Start the server
server_path = '../../../build/bin/server'
if 'LLAMA_SERVER_BIN_PATH' in os.environ:
server_path = os.environ['LLAMA_SERVER_BIN_PATH']
server_args = [
'--host', args.host,
'--port', args.port,
]
model_file = args.model_path_prefix + os.path.sep + args.hf_file
model_dir = os.path.dirname(model_file)
if not os.path.exists(model_dir):
os.makedirs(model_dir)
server_args.extend(['--model', model_file])
server_args.extend(['--hf-repo', args.hf_repo])
server_args.extend(['--hf-file', args.hf_file])
server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
server_args.extend(['--ctx-size', args.ctx_size])
server_args.extend(['--parallel', args.parallel])
server_args.extend(['--batch-size', args.batch_size])
server_args.extend(['--ubatch-size', args.ubatch_size])
server_args.extend(['--n-predict', args.max_tokens * 2])
server_args.extend(['--defrag-thold', "0.1"])
server_args.append('--cont-batching')
server_args.append('--metrics')
server_args.extend(['--log-format', "text"])
args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
pkwargs = {
'stdout': subprocess.PIPE,
'stderr': subprocess.PIPE
}
server_process = subprocess.Popen(
args,
**pkwargs)
def server_log(in_stream, out_stream):
for line in iter(in_stream.readline, b''):
print(line.decode('utf-8'), end='', file=out_stream)
thread_stdout = threading.Thread(target=server_log, args=(server_process.stdout, sys.stdout))
thread_stdout.start()
thread_stderr = threading.Thread(target=server_log, args=(server_process.stderr, sys.stderr))
thread_stderr.start()
return server_process
def is_server_listening(server_fqdn, server_port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
result = sock.connect_ex((server_fqdn, server_port))
_is_server_listening = result == 0
if _is_server_listening:
print(f"server is listening on {server_fqdn}:{server_port}...")
return _is_server_listening
def escape_metric_name(metric_name):
return re.sub('[^A-Z0-9]', '_', metric_name.upper())
if __name__ == '__main__':
main()

View File

@ -0,0 +1,9 @@
global:
scrape_interval: 10s
external_labels:
llamacpp: 'server'
scrape_configs:
- job_name: 'llama.cpp server'
static_configs:
- targets: ['localhost:8080']

View File

@ -0,0 +1,2 @@
matplotlib
requests

View File

@ -3566,6 +3566,7 @@ int main(int argc, char ** argv) {
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
sigaction(SIGTERM, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;

View File

@ -1114,7 +1114,10 @@ def start_server_background(context):
server_args.append('--verbose')
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
server_args.extend(['--log-format', "text"])
print(f"starting server with: {context.server_path} {server_args}")
args = [str(arg) for arg in [context.server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
flags = 0
if 'nt' == os.name:
flags |= subprocess.DETACHED_PROCESS
@ -1130,16 +1133,14 @@ def start_server_background(context):
[str(arg) for arg in [context.server_path, *server_args]],
**pkwargs)
def log_stdout(process):
for line in iter(process.stdout.readline, b''):
print(line.decode('utf-8'), end='')
thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,))
def server_log(in_stream, out_stream):
for line in iter(in_stream.readline, b''):
print(line.decode('utf-8'), end='', file=out_stream)
thread_stdout = threading.Thread(target=server_log, args=(context.server_process.stdout, sys.stdout))
thread_stdout.start()
def log_stderr(process):
for line in iter(process.stderr.readline, b''):
print(line.decode('utf-8'), end='', file=sys.stderr)
thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,))
thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr))
thread_stderr.start()
print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")

View File

@ -145,6 +145,7 @@
# the same path you would with an overlay.
legacyPackages = {
llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
llamaPackagesWindows = pkgs.pkgsCross.mingwW64.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
};
@ -155,6 +156,7 @@
{
default = config.legacyPackages.llamaPackages.llama-cpp;
vulkan = config.packages.default.override { useVulkan = true; };
windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
}
// lib.optionalAttrs pkgs.stdenv.isLinux {
opencl = config.packages.default.override { useOpenCL = true; };
@ -168,9 +170,14 @@
};
# Packages exposed in `.#checks` will be built by the CI and by
# `nix flake check`. Currently we expose all packages, but we could
# make more granular choices
checks = config.packages;
# `nix flake check`.
#
# We could test all outputs e.g. as `checks = confg.packages`.
#
# TODO: Build more once https://github.com/ggerganov/llama.cpp/issues/6346 has been addressed
checks = {
inherit (config.packages) default vulkan;
};
};
};
}

View File

@ -2968,7 +2968,7 @@ namespace dpct
#include "ggml-common.h"
static int g_ggml_sycl_debug=0;
#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) fprintf(stderr, __VA_ARGS__);}while(0)
#define CHECK_TRY_ERROR(expr) \
[&]() { \
@ -12868,6 +12868,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
}
void ggml_backend_sycl_print_sycl_devices() {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
int device_count = dpct::dev_mgr::instance().device_count();
std::map<std::string, size_t> DeviceNums;
fprintf(stderr, "found %d SYCL devices:\n", device_count);
@ -12925,7 +12926,9 @@ static void ggml_init_sycl() try {
static bool initialized = false;
if (!initialized) {
fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
#if defined(GGML_SYCL_F16)
@ -14986,6 +14989,9 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
bool no_mixed_dtypes = main_stream->get_backend() == sycl::backend::ext_oneapi_cuda ||
main_stream->get_backend() == sycl::backend::ext_oneapi_hip;
SYCL_CHECK(
CHECK_TRY_ERROR(g_sycl_handles[g_main_device] = main_stream));
@ -15016,24 +15022,38 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
if (no_mixed_dtypes) {
cu_compute_type = dpct::library_data_t::real_half;
cu_data_type = dpct::library_data_t::real_half;
}
// dst strides
size_t nbd2 = dst->nb[2];
size_t nbd3 = dst->nb[3];
const sycl::half alpha_f16 = 1.0f;
const sycl::half beta_f16 = 0.0f;
const float alpha_f32 = 1.0f;
const float beta_f32 = 0.0f;
const sycl::half alpha_f16 = 1.0f;
const sycl::half beta_f16 = 0.0f;
const void * alpha = &alpha_f32;
const void * beta = &beta_f32;
if (no_mixed_dtypes) {
alpha = &alpha_f16;
beta = &beta_f16;
}
// TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
// oneMKL open source supports half, half, float, float: datatypes
// when oneMKL open source supports half, half, float, float: datatypes
dst_t = (char *) dst_ddf;
if (no_mixed_dtypes) {
dst_t = (char *) dst_f16.alloc(ne_dst);
nbd2 /= sizeof(float) / sizeof(sycl::half);
nbd3 /= sizeof(float) / sizeof(sycl::half);
}
GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0);
@ -15119,6 +15139,10 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
}
#endif
if (no_mixed_dtypes) {
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
}
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@ -16018,6 +16042,7 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
}
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
for(int i=0;i<max_len;i++) id_list[i] = -1;
if (!g_sycl_gpu_mgr) {
@ -16052,6 +16077,7 @@ catch (sycl::exception const &exc) {
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
size_t description_size) try {
GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
dpct::device_info prop;
int device_id = g_sycl_gpu_mgr->gpus[device];
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
@ -16066,6 +16092,7 @@ catch (sycl::exception const &exc) {
GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
size_t *total) try {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
ggml_sycl_set_device(device);
/*
@ -16417,7 +16444,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
};
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
ggml_init_sycl();
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
if (device_index>=g_device_count or device_index<0) {
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
device_index, g_device_count-1);
@ -16787,6 +16815,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
};
GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
ggml_init_sycl();
// FIXME: this is not thread safe
static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
@ -16859,6 +16888,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
}
ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
/* .iface = */ {
/* .get_name = */ ggml_backend_sycl_host_buffer_type_name,
@ -17155,6 +17185,7 @@ static ggml_guid_t ggml_backend_sycl_guid() {
}
GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
ggml_init_sycl();
check_allow_gpu_index(device);
@ -17181,6 +17212,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
}
GGML_CALL int ggml_backend_sycl_get_device_count() {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
return g_sycl_gpu_mgr->get_gpu_count();
}
@ -17193,16 +17225,21 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params,
}
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_index\n");
return g_sycl_gpu_mgr->get_index(device_id);
}
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n");
return g_sycl_gpu_mgr->gpus[device_index];
}
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
ggml_init_sycl();
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
if (g_sycl_gpu_mgr) {
delete g_sycl_gpu_mgr;
}
@ -17213,6 +17250,9 @@ GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id
}
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
ggml_init_sycl();
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_mul_device_mode\n");
if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
return;
}

2
ggml.c
View File

@ -3000,7 +3000,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
data_size *= ne[i];
}
GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));
void * data = view_src != NULL ? view_src->data : NULL;
if (data != NULL) {

View File

@ -9194,6 +9194,7 @@ struct llm_build_context {
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
}
struct ggml_tensor * attn_out = cur;

View File

@ -60,9 +60,9 @@ extern "C" {
enum llama_vocab_type {
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece
LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding
LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
};
// note: these values should be synchronized with ggml_rope