Merge branch 'master' into ceb/bert-tokenizer-fixes

Jared Van Bortel 2024-04-04 16:02:31 -04:00
commit 0d052cbe39
71 changed files with 30321 additions and 7110 deletions

View File

@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all
 RUN apt-get update && \
-apt-get install -y build-essential python3 python3-pip git
+apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
 COPY requirements.txt requirements.txt
 COPY requirements requirements
@@ -28,6 +28,8 @@ COPY . .
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1
+# Enable cURL
+ENV LLAMA_CURL=1
 RUN make

View File

@@ -40,6 +40,11 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
+# Enable cURL
+ENV LLAMA_CURL=1
+RUN apt-get update && \
+apt-get install -y libcurl4-openssl-dev
 RUN make
 ENTRYPOINT ["/app/.devops/tools.sh"]

View File

@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build
 RUN apt-get update && \
-apt-get install -y build-essential python3 python3-pip git
+apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
 COPY requirements.txt requirements.txt
 COPY requirements requirements
@@ -15,6 +15,9 @@ WORKDIR /app
 COPY . .
+ENV LLAMA_CURL=1
 RUN make
 ENV LC_ALL=C.utf8
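The same curl-enabled build can be reproduced outside Docker. A minimal sketch for a Debian/Ubuntu host, mirroring the package and flag added above (the package manager and sudo usage are assumptions about the host):

```sh
# Install the cURL development package the build now links against
sudo apt-get update && sudo apt-get install -y libcurl4-openssl-dev

# Build llama.cpp with libcurl support, mirroring ENV LLAMA_CURL=1 in the Dockerfile
LLAMA_CURL=1 make
```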

View File

@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

View File

@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

View File

@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

View File

@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all
 RUN apt-get update && \
-apt-get install -y build-essential git
+apt-get install -y build-essential git libcurl4-openssl-dev
 WORKDIR /app
@@ -22,11 +22,16 @@ COPY . .
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1
+# Enable cURL
+ENV LLAMA_CURL=1
 RUN make
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+RUN apt-get update && \
+apt-get install -y libcurl4-openssl-dev
 COPY --from=build /app/server /server
 ENTRYPOINT [ "/server" ]

View File

@@ -4,7 +4,7 @@ FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
 ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
-apt-get install -y git
+apt-get install -y git libcurl4-openssl-dev
 WORKDIR /app
@@ -16,11 +16,14 @@ RUN mkdir build && \
 echo "LLAMA_SYCL_F16 is set" && \
 export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
 fi && \
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
 cmake --build . --config Release --target server
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+RUN apt-get update && \
+apt-get install -y libcurl4-openssl-dev
 COPY --from=build /app/build/bin/server /server
 ENV LC_ALL=C.utf8

View File

@@ -40,6 +40,11 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
+# Enable cURL
+ENV LLAMA_CURL=1
+RUN apt-get update && \
+apt-get install -y libcurl4-openssl-dev
 RUN make
 ENTRYPOINT [ "/app/server" ]

View File

@@ -11,12 +11,16 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 apt update -y && \
 apt-get install -y vulkan-sdk
+# Install cURL
+RUN apt-get update && \
+apt-get install -y libcurl4-openssl-dev
 # Build it
 WORKDIR /app
 COPY . .
 RUN mkdir build && \
 cd build && \
-cmake .. -DLLAMA_VULKAN=1 && \
+cmake .. -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
 cmake --build . --config Release --target server
 # Clean up

View File

@@ -3,16 +3,21 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build
 RUN apt-get update && \
-apt-get install -y build-essential git
+apt-get install -y build-essential git libcurl4-openssl-dev
 WORKDIR /app
 COPY . .
+ENV LLAMA_CURL=1
 RUN make
 FROM ubuntu:$UBUNTU_VERSION as runtime
+RUN apt-get update && \
+apt-get install -y libcurl4-openssl-dev
 COPY --from=build /app/server /server
 ENV LC_ALL=C.utf8

View File

@@ -24,15 +24,15 @@ on:
 push:
 branches:
 - master
-paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
+paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
-pull_request:
+pull_request_target:
 types: [opened, synchronize, reopened]
-paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
+paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
 schedule:
 - cron: '04 2 * * *'
 concurrency:
-group: ${{ github.workflow }}-${{ github.ref }}
+group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
 cancel-in-progress: true
 jobs:
@@ -42,11 +42,21 @@ jobs:
 RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
 N_USERS: 8
 DURATION: 10m
+strategy:
+matrix:
+model: [phi-2]
+ftype: [q4_0, q8_0, f16]
+include:
+- model: phi-2
+ftype: q4_0
+pr_comment_enabled: "true"
 if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 with:
 fetch-depth: 0
 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
@@ -116,7 +126,7 @@ jobs:
 --scenario script.js \
 --duration ${{ github.event.inputs.duration || env.DURATION }} \
 --hf-repo ggml-org/models \
---hf-file phi-2/ggml-model-q4_0.gguf \
+--hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
 --model-path-prefix /models \
 --parallel ${{ env.N_USERS }} \
 -ngl 33 \
@@ -134,7 +144,7 @@ jobs:
 - uses: actions/upload-artifact@v4
 with:
-name: benchmark-results
+name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
 compression-level: 9
 path: |
 examples/server/bench/*.jpg
@@ -143,11 +153,10 @@ jobs:
 - name: Commit status
 uses: Sibz/github-status-action@v1
+continue-on-error: true # If not authorized on external repo
 with:
 authToken: ${{secrets.GITHUB_TOKEN}}
 sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
-context: bench-server-baseline
+context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
 description: |
 ${{ env.BENCH_RESULTS }}
 state: 'success'
@@ -204,11 +213,19 @@ jobs:
 - name: Comment PR
 uses: mshick/add-pr-comment@v2
 id: comment_pr
-if: ${{ github.event.pull_request != '' }}
+if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
 with:
-message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
+message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
 message: |
-📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+<p align="center">
+📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+</p>
+<details>
+<summary>Expand details for performance related PR only</summary>
 - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
 - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
@@ -216,9 +233,6 @@ jobs:
 - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
 - ${{ env.BENCH_GRAPH_XLABEL }}
-<details>
-<summary>Time series</summary>
 <p align="center">
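For reference, a small sketch of how the new matrix expands the templated `--hf-file` argument above into concrete file names; this is illustrative only, the actual expansion is done by the workflow runner:

```sh
# Expand the --hf-file template for the phi-2 model across the three quantization types
for ftype in q4_0 q8_0 f16; do
    echo "--hf-file phi-2/ggml-model-${ftype}.gguf"
done
```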

View File

@@ -16,7 +16,7 @@ on:
 paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
 concurrency:
-group: ${{ github.workflow }}-${{ github.ref }}
+group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
 cancel-in-progress: true
 env:
@@ -31,7 +31,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Dependencies
 id: depends
@@ -76,10 +76,10 @@ jobs:
 - name: Upload artifacts
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
 with:
-path: |
-llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
+path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
+name: llama-bin-macos-arm64.zip
 macOS-latest-cmake-x64:
 runs-on: macos-latest
@@ -87,7 +87,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Dependencies
 id: depends
@@ -132,10 +132,10 @@ jobs:
 - name: Upload artifacts
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
 with:
-path: |
-llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
+path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
+name: llama-bin-macos-x64.zip
 ubuntu-focal-make:
 runs-on: ubuntu-20.04
@@ -146,7 +146,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Dependencies
 id: depends
@@ -158,7 +158,7 @@ jobs:
 with:
 node-version: "20"
-- uses: actions/setup-python@v4
+- uses: actions/setup-python@v5
 with:
 python-version: "3.11"
@@ -181,7 +181,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Dependencies
 id: depends
@@ -203,7 +203,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Dependencies
 id: depends
@@ -249,7 +249,7 @@ jobs:
 # steps:
 # - name: Clone
 # id: checkout
-# uses: actions/checkout@v3
+# uses: actions/checkout@v4
 #
 # - name: Dependencies
 # id: depends
@@ -283,7 +283,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Dependencies
 id: depends
@@ -311,7 +311,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Dependencies
 id: depends
@@ -357,7 +357,7 @@ jobs:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Build
 id: cmake_build
@@ -398,7 +398,7 @@ jobs:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Build
 id: cmake_build
@@ -418,7 +418,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Dependencies
 id: depends
@@ -449,7 +449,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Dependencies
 id: depends
@@ -593,7 +593,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 with:
 fetch-depth: 0
@@ -723,10 +723,10 @@ jobs:
 - name: Upload artifacts
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
 with:
-path: |
-llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
+path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
+name: llama-bin-win-${{ matrix.build }}-x64.zip
 windows-latest-cmake-cuda:
 runs-on: windows-latest
@@ -739,7 +739,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 with:
 fetch-depth: 0
@@ -779,10 +779,10 @@ jobs:
 - name: Upload artifacts
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
 with:
-path: |
-llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
 - name: Copy and pack Cuda runtime
 run: |
@@ -793,10 +793,10 @@ jobs:
 - name: Upload Cuda runtime
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
 with:
-path: |
-cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
 windows-latest-cmake-sycl:
 runs-on: windows-latest
@@ -812,7 +812,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 with:
 fetch-depth: 0
@@ -844,17 +844,17 @@ jobs:
 - name: Upload artifacts
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
 with:
-path: |
-llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
+path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
+name: llama-bin-win-sycl-x64.zip
 ios-xcode-build:
 runs-on: macos-latest
 steps:
 - name: Checkout code
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Build Xcode project
 run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
@@ -864,7 +864,7 @@ jobs:
 steps:
 - name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Set up JDK
 uses: actions/setup-java@v3
@@ -887,7 +887,7 @@ jobs:
 # runs-on: macos-12
 # steps:
 # - name: Clone
-# uses: actions/checkout@v3
+# uses: actions/checkout@v4
 #
 # - name: Build
 # uses: cross-platform-actions/action@v0.19.0
@@ -918,7 +918,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 with:
 fetch-depth: 0
@@ -937,7 +937,7 @@ jobs:
 - name: Download artifacts
 id: download-artifact
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
 - name: Create release
 id: create_release
@@ -978,7 +978,7 @@ jobs:
 #
 # steps:
 # - name: Clone
-# uses: actions/checkout@v3
+# uses: actions/checkout@v4
 #
 # - name: Dependencies
 # run: |
@@ -1002,7 +1002,7 @@ jobs:
 #
 # steps:
 # - name: Clone
-# uses: actions/checkout@v3
+# uses: actions/checkout@v4
 #
 # - name: Dependencies
 # run: |
@@ -1026,7 +1026,7 @@ jobs:
 #
 # steps:
 # - name: Clone
-# uses: actions/checkout@v3
+# uses: actions/checkout@v4
 #
 # - name: Dependencies
 # run: |
@@ -1056,7 +1056,7 @@ jobs:
 #
 # steps:
 # - name: Clone
-# uses: actions/checkout@v3
+# uses: actions/checkout@v4
 #
 # - name: Add msbuild to PATH
 # uses: microsoft/setup-msbuild@v1
@@ -1072,7 +1072,7 @@ jobs:
 # msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
 #
 # - name: Upload binaries
-# uses: actions/upload-artifact@v1
+# uses: actions/upload-artifact@v4
 # with:
 # name: llama-bin-${{ matrix.arch }}
 # path: build/bin/${{ matrix.build }}
@@ -1095,7 +1095,7 @@ jobs:
 #
 # steps:
 # - name: Clone
-# uses: actions/checkout@v3
+# uses: actions/checkout@v4
 #
 # - name: Add msbuild to PATH
 # uses: microsoft/setup-msbuild@v1
@@ -1127,7 +1127,7 @@ jobs:
 #
 # - name: Upload binaries
 # if: matrix.blas == 'ON'
-# uses: actions/upload-artifact@v1
+# uses: actions/upload-artifact@v4
 # with:
 # name: llama-blas-bin-${{ matrix.arch }}
 # path: build/bin/${{ matrix.build }}
@@ -1141,7 +1141,7 @@ jobs:
 #
 # steps:
 # - name: Clone
-# uses: actions/checkout@v3
+# uses: actions/checkout@v4
 #
 # - name: Dependencies
 # run: |

View File

@@ -6,7 +6,7 @@ env:
 GGML_N_THREADS: 1
 concurrency:
-group: ${{ github.workflow }}-${{ github.ref }}
+group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
 cancel-in-progress: true
 jobs:
@@ -14,7 +14,7 @@ jobs:
 runs-on: ubuntu-20.04
 steps:
 - name: Checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Dependencies
 run: |

View File

@@ -16,7 +16,7 @@ on:
 - master
 concurrency:
-group: ${{ github.workflow }}-${{ github.ref }}
+group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
 cancel-in-progress: true
 jobs:
@@ -46,7 +46,7 @@ jobs:
 - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
 steps:
 - name: Check out the repo
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Set up QEMU
 uses: docker/setup-qemu-action@v2

View File

@@ -15,13 +15,13 @@ on:
 - master
 concurrency:
-group: ${{ github.workflow }}-${{ github.ref }}
+group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
 cancel-in-progress: true
 jobs:
 editorconfig:
 runs-on: ubuntu-latest
 steps:
-- uses: actions/checkout@v3
+- uses: actions/checkout@v4
 - uses: editorconfig-checker/action-editorconfig-checker@main
 - run: editorconfig-checker

View File

@@ -24,9 +24,9 @@ jobs:
 runs-on: ubuntu-latest
 steps:
-- uses: actions/checkout@v3
+- uses: actions/checkout@v4
 - name: Set up Python
-uses: actions/setup-python@v2
+uses: actions/setup-python@v5
 with:
 python-version: '3.9.x'
 - name: Install dependencies

View File

@@ -18,7 +18,7 @@ on:
 paths: ['**/*.nix', 'flake.lock']
 concurrency:
-group: ${{ github.workflow }}-${{ github.ref }}
+group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
 cancel-in-progress: true
 jobs:

View File

@@ -9,7 +9,7 @@ on:
 types: [opened, synchronize, reopened]
 concurrency:
-group: ${{ github.workflow }}-${{ github.ref }}
+group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
 cancel-in-progress: true
 jobs:

View File

@@ -17,7 +17,7 @@ on:
 - 'requirements/*.txt'
 concurrency:
-group: ${{ github.workflow }}-${{ github.ref }}
+group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
 cancel-in-progress: true
 jobs:
@@ -26,9 +26,9 @@ jobs:
 name: check-requirements
 steps:
 - name: Check out source repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Set up Python environment
-uses: actions/setup-python@v4
+uses: actions/setup-python@v5
 with:
 python-version: "3.11"
 - name: Run check-requirements.sh script

View File

@@ -3,7 +3,7 @@ name: flake8 Lint
 on: [push, pull_request]
 concurrency:
-group: ${{ github.workflow }}-${{ github.ref }}
+group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
 cancel-in-progress: true
 jobs:
@@ -12,9 +12,9 @@ jobs:
 name: Lint
 steps:
 - name: Check out source repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Set up Python environment
-uses: actions/setup-python@v4
+uses: actions/setup-python@v5
 with:
 python-version: "3.11"
 - name: flake8 Lint

View File

@@ -4,6 +4,10 @@ name: Server
 on:
 workflow_dispatch: # allows manual triggering
 inputs:
+sha:
+description: 'Commit SHA1 to build'
+required: false
+type: string
 slow_tests:
 description: 'Run slow tests'
 required: true
@@ -11,15 +15,15 @@ on:
 push:
 branches:
 - master
-paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
+paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
-pull_request:
+pull_request_target:
 types: [opened, synchronize, reopened]
-paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
+paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
 schedule:
-- cron: '0 0 * * *'
+- cron: '2 4 * * *'
 concurrency:
-group: ${{ github.workflow }}-${{ github.ref }}
+group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
 cancel-in-progress: true
 jobs:
@@ -44,25 +48,45 @@ jobs:
 options: --cpus 4
 steps:
-- name: Clone
-id: checkout
-uses: actions/checkout@v3
-with:
-fetch-depth: 0
 - name: Dependencies
 id: depends
 run: |
 apt-get update
 apt-get -y install \
 build-essential \
+xxd \
 git \
 cmake \
 python3-pip \
+curl \
 wget \
 language-pack-en \
 libcurl4-openssl-dev
+- name: Clone
+id: checkout
+uses: actions/checkout@v4
+with:
+fetch-depth: 0
+ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+- name: Verify server deps
+id: verify_server_deps
+run: |
+git config --global --add safe.directory $(realpath .)
+cd examples/server
+git ls-files --others --modified
+git status
+./deps.sh
+git status
+not_ignored_files="$(git ls-files --others --modified)"
+echo "Modified files: ${not_ignored_files}"
+if [ -n "${not_ignored_files}" ]; then
+echo "Repository is dirty or server deps are not built as expected"
+echo "${not_ignored_files}"
+exit 1
+fi
 - name: Build
 id: cmake_build
 run: |
@@ -102,7 +126,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 with:
 fetch-depth: 0
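The new `Verify server deps` step can be reproduced locally before pushing; a rough equivalent of what the workflow checks, assuming a clean working tree at the repository root:

```sh
# Regenerate the vendored server assets and confirm the tree stays clean,
# mirroring the verify_server_deps step above
cd examples/server
./deps.sh
git status --short   # should print nothing if deps.sh left no untracked/modified files
```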

View File

@@ -7,7 +7,7 @@ on:
 - master
 concurrency:
-group: ${{ github.workflow }}-${{ github.ref }}
+group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
 cancel-in-progress: true
 jobs:
@@ -18,7 +18,7 @@ jobs:
 runs-on: [ubuntu-latest, macos-latest, windows-latest]
 runs-on: ${{ matrix.runs-on }}
 steps:
-- uses: actions/checkout@v3
+- uses: actions/checkout@v4
 with:
 submodules: recursive
 fetch-depth: 0

View File

@@ -113,6 +113,9 @@ option(LLAMA_METAL "llama: use Metal"
 option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
 option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
 option(LLAMA_METAL_EMBED_LIBRARY "llama: embed Metal library" OFF)
+set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
+    "llama: metal minimum macOS version")
+set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
@@ -250,6 +253,16 @@ if (LLAMA_METAL)
 set(XC_FLAGS -O3)
 endif()
+# Append macOS metal versioning flags
+if (LLAMA_METAL_MACOSX_VERSION_MIN)
+    message(STATUS "Adding -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
+    list(APPEND XC_FLAGS -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN})
+endif()
+if (LLAMA_METAL_STD)
+    message(STATUS "Adding -std=${LLAMA_METAL_STD} flag to metal compilation")
+    list(APPEND XC_FLAGS -std=${LLAMA_METAL_STD})
+endif()
 add_custom_command(
 OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
 COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
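A minimal configure sketch using the two new cache variables; the version values below are placeholders rather than recommendations from this change:

```sh
mkdir -p build && cd build
# Example values only; use a deployment target and Metal -std supported by your Xcode toolchain
cmake .. -DLLAMA_METAL=ON -DLLAMA_METAL_MACOSX_VERSION_MIN=12.3 -DLLAMA_METAL_STD=metal3.0
cmake --build . --config Release
```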

View File

@@ -867,6 +867,10 @@ passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 (cd examples/batched.swift; make build)
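With the new rule in place, the grammar validator example can be built on its own; a quick sketch (the binary's command-line interface is not part of this diff):

```sh
# Build only the new gbnf-validator example target; like the other Makefile
# examples, the binary is produced in the repository root
make gbnf-validator
```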

View File

@@ -3,7 +3,7 @@
 - [Background](#background)
 - [News](#news)
 - [OS](#os)
-- [Intel GPU](#intel-gpu)
+- [Supported Devices](#supported-devices)
 - [Docker](#docker)
 - [Linux](#linux)
 - [Windows](#windows)
@@ -14,17 +14,25 @@
 ## Background
-SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
+**SYCL** is a high-level parallel programming model designed to improve developers productivity writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing and based on standard C++17.
-oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
+**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
-Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
+- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
+- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
+- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
+- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
-To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
+### Llama.cpp + SYCL
+This SYCL "backend" follows the same design found in other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. The oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
-The llama.cpp for SYCL is used to support Intel GPUs.
+The llama.cpp SYCL backend supports:
+- Intel GPUs.
+- Nvidia GPUs.
-For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
+*Upcoming support: AMD GPUs*.
+When targetting **Intel CPUs**, it is recommended to use llama.cpp for [x86_64](README.md#intel-onemkl) approach.
 ## News
@@ -51,9 +59,16 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
 |Windows|Support|Windows 11|
-## Intel GPU
+## Supported devices
-### Verified
+### Intel GPUs
+The oneAPI Math Kernel Library, which the oneAPI base-toolkit includes, supports intel GPUs. In order to make it "visible", simply run the following:
+```sh
+source /opt/intel/oneapi/setvars.sh
+```
+- **Tested devices**
 |Intel GPU| Status | Verified Model|
 |-|-|-|
@@ -63,198 +78,229 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
 |Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
 |Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
-Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use.
+*Notes:*
-### Memory
+- Device memory can be a limitation when running a large model on an intel GPU. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
-The memory is a limitation to run LLM on GPUs.
+- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPUs and 4.0GB for discrete GPUs.
-When run llama.cpp, there is print log to show the applied memory on GPU. You could know how much memory to be used in your case. Like `llm_load_tensors: buffer size = 3577.56 MiB`.
+- If the iGPU has less than 80 EUs *(Execution Unit)*, the inference speed will likely be too slow for practical use.
-For iGPU, please make sure the shared memory from host memory is enough. For llama-2-7b.Q4_0, recommend the host memory is 8GB+.
+### Nvidia GPUs
+The BLAS acceleration on Nvidia GPUs through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)
-For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.
+- **Tested devices**
-## Nvidia GPU
+|Nvidia GPU| Status | Verified Model|
-### Verified
-|Intel GPU| Status | Verified Model|
 |-|-|-|
-|Ampere Series| Support| A100|
+|Ampere Series| Support| A100, A4000|
+|Ampere Series *(Mobile)*| Support| RTX 40 Series|
-### oneMKL for CUDA
+*Notes:*
+- Support for Nvidia targets through oneAPI is currently limited to Linux platforms.
-The current oneMKL release does not contain the oneMKL cuBlas backend.
+- Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs.
-As a result for Nvidia GPU's oneMKL must be built from source.
-```
-git clone https://github.com/oneapi-src/oneMKL
-cd oneMKL
-mkdir build
-cd build
-cmake -G Ninja .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON
-ninja
-// Add paths as necessary
-```
 ## Docker
+The docker build option is currently limited to *intel GPU* targets.
-Note:
+### Build image
-- Only docker on Linux is tested. Docker on WSL may not work.
-- You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that)
-### Build the image
-You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
 ```sh
-# For F16:
+# Using FP16
-#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
+docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
-# Or, for F32:
-docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
-# Note: you can also use the ".devops/server-intel.Dockerfile", which compiles the "server" example
 ```
-### Run
+*Notes*:
+To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
+You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
+### Run container
 ```sh
-# Firstly, find all the DRI cards:
+# First, find all the DRI cards
 ls -la /dev/dri
-# Then, pick the card that you want to use.
+# Then, pick the card that you want to use (here for e.g. /dev/dri/card1).
-# For example with "/dev/dri/card1"
 docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
 ```
+*Notes:*
+- Docker has been tested successfully on native Linux. WSL support has not been verified yet.
+- You may need to install Intel GPU driver on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
 ## Linux
-### Setup Environment
+### I. Setup Environment
-1. Install Intel GPU driver.
+1. **Install GPU drivers**
-a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
+- **Intel GPU**
-Note: for iGPU, please install the client GPU driver.
+Intel data center GPUs drivers installation guide and download page can be found here: [Get intel dGPU Drivers](https://dgpu-docs.intel.com/driver/installation.html#ubuntu-install-steps).
-b. Add user to group: video, render.
+*Note*: for client GPUs *(iGPU & Arc A-Series)*, please refer to the [client iGPU driver installation](https://dgpu-docs.intel.com/driver/client/overview.html).
+Once installed, add the user(s) to the `video` and `render` groups.
 ```sh
-sudo usermod -aG render username
+sudo usermod -aG render $USER
-sudo usermod -aG video username
+sudo usermod -aG video $USER
 ```
-Note: re-login to enable it.
+*Note*: logout/re-login for the changes to take effect.
-c. Check
+Verify installation through `clinfo`:
 ```sh
 sudo apt install clinfo
 sudo clinfo -l
 ```
-Output (example):
+Sample output:
-```
+```sh
 Platform #0: Intel(R) OpenCL Graphics
 `-- Device #0: Intel(R) Arc(TM) A770 Graphics
 Platform #0: Intel(R) OpenCL HD Graphics
 `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
 ```
-2. Install Intel® oneAPI Base toolkit.
+- **Nvidia GPU**
-a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
+In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed.
+Installation can be verified by running the following:
+```sh
+nvidia-smi
+```
+Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*:
+```
++---------------------------------------------------------------------------------------+
+| NVIDIA-SMI 535.54.03 Driver Version: 535.54.03 CUDA Version: 12.2 |
+|-----------------------------------------+----------------------+----------------------+
+| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|=========================================+======================+======================|
+| 0 NVIDIA A100-PCIE-40GB On | 00000000:8D:00.0 Off | 0 |
+| N/A 36C P0 57W / 250W | 4MiB / 40960MiB | 0% Default |
+| | | Disabled |
++-----------------------------------------+----------------------+----------------------+
+```
-Recommend to install to default folder: **/opt/intel/oneapi**.
-Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
+2. **Install Intel® oneAPI Base toolkit**
-b. Check
+- **Base installation**
+The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
+Please follow the instructions for downloading and installing the Toolkit for Linux, and preferably keep the default installation values unchanged, notably the installation path *(`/opt/intel/oneapi` by default)*.
+Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
+Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
+- **Adding support to Nvidia GPUs**
+**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
+**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
-```sh
-source /opt/intel/oneapi/setvars.sh
+git clone https://github.com/oneapi-src/oneMKL
+cd oneMKL
+mkdir -p buildWithCublas && cd buildWithCublas
+cmake ../ -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
+make
+```
+3. **Verify installation and environment**
+In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
+```sh
+source /opt/intel/oneapi/setvars.sh
 sycl-ls
 ```
-There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
+- **Intel GPU**
+When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below:
-Output (example):
 ```
 [opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
 [opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
 [opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
 [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
 ```
-2. Build locally:
+- **Nvidia GPU**
-Note:
+Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
-- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
+```
-- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
+[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
+[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
+[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2]
+```
+### II. Build llama.cpp
+#### Intel GPU
 ```sh
-mkdir -p build
+# Export relevant ENV variables
-cd build
 source /opt/intel/oneapi/setvars.sh
-# For FP16:
+# Build LLAMA with MKL BLAS acceleration for intel GPU
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+mkdir -p build && cd build
-# Or, for FP32:
+# Option 1: Use FP16 for better performance in long-prompt inference
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+# Option 2: Use FP32 by default
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-# For Nvidia GPUs
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-# Build example/main only
-#cmake --build . --config Release --target main
-# Or, build all binary
-cmake --build . --config Release -v
-cd ..
 ```
-or
+#### Nvidia GPU
 ```sh
-./examples/sycl/build.sh
+# Export relevant ENV variables
+export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
+export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
+export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
+export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
+# Build LLAMA with Nvidia BLAS acceleration through SYCL
+mkdir -p build && cd build
+# Option 1: Use FP16 for better performance in long-prompt inference
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+# Option 2: Use FP32 by default
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 ```
-### Run
+### III. Run the inference
-1. Put model file to folder **models**
+1. Retrieve and prepare model
-You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
+You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
 2. Enable oneAPI running environment
-```
+```sh
 source /opt/intel/oneapi/setvars.sh
 ```
-3. List device ID
+3. List devices information
-Run without parameter:
+Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
 ```sh
 ./build/bin/ls-sycl-device
+# or running the "main" executable and look at the output log:
+./build/bin/main
 ```
+A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
-Check the ID in startup log, like:
 ```
 found 6 SYCL devices:
 | | | |Compute |Max compute|Max work|Max sub| |
@ -270,15 +316,15 @@ found 6 SYCL devices:
|Attribute|Note| |Attribute|Note|
|-|-| |-|-|
|compute capability 1.3|Level-zero running time, recommended | |compute capability 1.3|Level-zero driver/runtime, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases| |compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases|
4. Device selection and execution of llama.cpp 4. Launch inference
There are two device selection modes: There are two device selection modes:
- Single device: Use one device assigned by user. - Single device: Use one device target specified by the user.
- Multiple devices: Automatically choose the devices with the same biggest Max compute units. - Multiple devices: Automatically select the devices with the same largest Max compute-units.
|Device selection|Parameter| |Device selection|Parameter|
|-|-| |-|-|
@ -303,74 +349,64 @@ or run by script:
```sh ```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
``` ```
or run by script:
Otherwise, you can run the script:
```sh ```sh
./examples/sycl/run_llama2.sh ./examples/sycl/run_llama2.sh
``` ```
Note: *Notes:*
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue. - By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
```sh
5. Verify the device ID in output
Verify to see if the selected GPU is shown in the output, like:
```
detect 1 SYCL GPUs: [0] with top Max compute units:512 detect 1 SYCL GPUs: [0] with top Max compute units:512
``` ```
Or Or
``` ```sh
use 1 SYCL GPUs: [0] with Max compute units:512 use 1 SYCL GPUs: [0] with Max compute units:512
``` ```
## Windows

### I. Setup Environment

1. Install GPU driver

The Intel GPU driver instructions and download page can be found here: [Get Intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).

2. Install Visual Studio

If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/).

3. Install Intel® oneAPI Base toolkit

The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.

Please follow the instructions for downloading and installing the Toolkit for Windows, and preferably keep the default installation values unchanged, notably the installation path *(`C:\Program Files (x86)\Intel\oneAPI` by default)*.

The following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.

b. Enable oneAPI running environment:

- Type "oneAPI" in the search bar, then open the `Intel oneAPI command prompt for Intel 64 for Visual Studio 2022` App.
- On the command prompt, enable the runtime environment with the following:

```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```

c. Verify installation

In the oneAPI command line, run the following to print the available SYCL devices:

```
sycl-ls
```

There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is an example of such output, detecting an *Intel Iris Xe* GPU as a Level-zero SYCL device:

Output (example):
```
@ -380,7 +416,7 @@ Output (example):
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
```
4. Install build tools

a. Download & install cmake for Windows: https://cmake.org/download/
@ -390,76 +426,53 @@ b. Download & install mingw-w64 make for Windows provided by w64devkit
- Extract `w64devkit` on your PC.
- Add the **bin** folder path to the Windows system PATH environment (e.g. `C:\xxx\w64devkit\bin\`).
### II. Build llama.cpp

On the oneAPI command line window, step into the llama.cpp main directory and run the following:

```
mkdir -p build
cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
make
```

Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:

```sh
.\examples\sycl\win-build-sycl.bat
```

*Notes:*

- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.
### III. Run the inference

1. Retrieve and prepare model

You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example.
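For instance, a hedged way to fetch that example model into the **models** folder with `curl` (the `resolve/main` download URL is assumed from the Hugging Face page linked above):

```sh
curl -L -o models/llama-2-7b.Q4_0.gguf https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf
```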
2. Enable oneAPI running environment

On the oneAPI command line window, run the following and step into the llama.cpp directory:

```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```
3. List devices information

Similar to the native `sycl-ls`, the available SYCL devices can be queried as follows:

```
build\bin\ls-sycl-device.exe
```

The output of this command on a system with 1 *Intel CPU* and 1 *Intel GPU* would look like the following:

```
found 6 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| |
@ -480,7 +493,7 @@ found 6 SYCL devices:
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|

4. Launch inference

There are two device selection modes:
@ -505,7 +518,7 @@ build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be
```
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```

Otherwise, run the following wrapper script:

```
.\examples\sycl\win-run-llama2.bat
```
@ -513,19 +526,14 @@ or run by script:
Note:

- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to `main.exe` if you face this issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follows:

```sh
detect 1 SYCL GPUs: [0] with top Max compute units:512
```
Or
```sh
use 1 SYCL GPUs: [0] with Max compute units:512
```
@ -535,64 +543,54 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|Name|Value|Function|
|-|-|-|
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.|
|LLAMA_SYCL_TARGET|INTEL *(default)* \| NVIDIA|Set the SYCL target device type.|
|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.|
|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.|
|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.|
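As an illustration, a hedged Linux configure line combining these options (the NVIDIA target value and the compiler names are taken from the table above; adjust them to your setup):

```sh
# configure a SYCL build for an NVIDIA target with FP16 kernels enabled
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DLLAMA_SYCL_F16=ON \
    -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCMAKE_BUILD_TYPE=Release
```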
#### Runtime
|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|
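For example, a possible way to combine these runtime variables with the Linux run command shown earlier in this guide (model path and prompt reused from above):

```sh
GGML_SYCL_DEBUG=1 ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```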
## Known Issues

- Hanging during startup

  llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang.

  - **Solution**: add the `--no-mmap` or `--mmap 0` flag to the `main` executable (see the example below).

- `Split-mode:[row]` is not supported.
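For reference, a sketch of that workaround applied to the Linux run command used earlier in this guide:

```sh
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 --no-mmap
```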
## Q&A

- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.

  - Potential cause: Unavailable oneAPI installation or unset ENV variables.
  - Solution: Install the *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`.

- General compiler error:

  - Remove the build folder or try a clean build.

- I can **not** see `[ext_oneapi_level_zero:gpu]` after installing the GPU driver on Linux.

  Please double-check with `sudo sycl-ls`.

  If it is present in the list, please add the video/render groups to your user, then **logout/login** or restart your system:

  ```
  sudo usermod -aG render $USER
  sudo usermod -aG video $USER
  ```

  Otherwise, please double-check the GPU driver installation steps.

### **GitHub contribution**:
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.

## Todo

View File

@ -18,12 +18,13 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Hot topics
- **MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387**
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
- Support loading sharded model, using `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187
----
@ -115,7 +116,9 @@ Typically finetunes of the base models below are supported as well.
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
- [x] [Gemma](https://ai.google.dev/gemma)
- [x] [Mamba](https://github.com/state-spaces/mamba)
- [x] [Xverse](https://huggingface.co/models?search=xverse)
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)

**Multimodal models:**
@ -139,6 +142,7 @@ Typically finetunes of the base models below are supported as well.
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
@ -175,6 +179,10 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
- [Msty](https://msty.app) (proprietary)
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
--- ---
@ -632,15 +640,6 @@ Building the program with BLAS support may lead to some performance improvements
- #### Vulkan
> [!WARNING]
>
> Vulkan support has been broken in https://github.com/ggerganov/llama.cpp/pull/6122
> due to relying on `GGML_OP_GET_ROWS` which is not yet properly supported by the Vulkan backend,
> but should be fixed relatively soon (possibly in https://github.com/ggerganov/llama.cpp/pull/6155
> (ref: https://github.com/ggerganov/llama.cpp/pull/6122#issuecomment-2015327635)).
>
> Meanwhile, if you want to use the Vulkan backend, you should use the commit right before the breaking change, https://github.com/ggerganov/llama.cpp/commit/55c1b2a3bbd470e9e2a3a0618b92cf64a885f806
**With docker**:

You don't need to install Vulkan SDK. It will be installed inside the container.

SECURITY.md Normal file
View File

@ -0,0 +1,67 @@
# Security Policy
- [**Using llama.cpp securely**](#using-llamacpp-securely)
- [Untrusted models](#untrusted-models)
- [Untrusted inputs](#untrusted-inputs)
- [Data privacy](#data-privacy)
- [Untrusted environments or networks](#untrusted-environments-or-networks)
- [Multi-Tenant environments](#multi-tenant-environments)
- [**Reporting a vulnerability**](#reporting-a-vulnerability)
## Using llama.cpp securely
### Untrusted models
Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.
*Always execute untrusted models within a secure, isolated environment such as a sandbox* (e.g., containers, virtual machines). This helps protect your system from potentially malicious code.
> [!NOTE]
> The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance.
### Untrusted inputs
Some models accept various input formats (text, images, audio, etc.). The libraries converting these inputs have varying security levels, so it's crucial to isolate the model and carefully pre-process inputs to mitigate script injection risks.
For maximum security when handling untrusted inputs, you may need to employ the following:
* Sandboxing: Isolate the environment where the inference happens.
* Pre-analysis: Check how the model performs by default when exposed to prompt injection (e.g. using [fuzzing for prompt injection](https://github.com/FonduAI/awesome-prompt-injection?tab=readme-ov-file#tools)). This will give you leads on how hard you will have to work on the next topics.
* Updates: Keep both LLaMA C++ and your libraries updated with the latest security patches.
* Input Sanitation: Before feeding data to the model, sanitize inputs rigorously. This involves techniques such as:
* Validation: Enforce strict rules on allowed characters and data types.
* Filtering: Remove potentially malicious scripts or code fragments.
* Encoding: Convert special characters into safe representations.
* Verification: Run tooling that identifies potential script injections (e.g. [models that detect prompt injection attempts](https://python.langchain.com/docs/guides/safety/hugging_face_prompt_injection)).
### Data privacy
To protect sensitive data from potential leaks or unauthorized access, it is crucial to sandbox the model execution. This means running the model in a secure, isolated environment, which helps mitigate many attack vectors.
### Untrusted environments or networks
If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value (see the example after this list)
* Encrypt your data if sending it over the network.
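For instance, a minimal sketch of such a hash check with `sha256sum` (the file name and the expected digest are placeholders; use the checksum published by the artifact's provider):

```sh
# compare the local file's SHA-256 digest against the provider's published value
echo "<expected-sha256>  llama-2-7b.Q4_0.gguf" | sha256sum --check -
```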
### Multi-Tenant environments
If you intend to run multiple models in parallel with shared memory, it is your responsibility to ensure the models do not interact or access each other's data. The primary areas of concern are tenant isolation, resource allocation, model sharing and hardware attacks.
1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.
1. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
1. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
1. Hardware Attacks: GPUs or TPUs can also be attacked. [Research](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side-channel attacks on GPUs are possible, which can leak data from other models or processes running on the same system at the same time.
## Reporting a vulnerability
Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
<!-- normal version -->
However, if you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).
This project is maintained by a team of volunteers on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public exposure.

View File

@ -575,7 +575,7 @@ function gg_run_embd_bge_small {
cd ${SRC} cd ${SRC}
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin

View File

@ -1928,11 +1928,6 @@ struct llama_model * llama_load_model_from_url(
return NULL; return NULL;
} }
if (!curl) {
fprintf(stderr, "%s: error initializing libcurl\n", __func__);
return NULL;
}
if (!llama_download_file(curl, model_url, path_model)) { if (!llama_download_file(curl, model_url, path_model)) {
return NULL; return NULL;
} }

View File

@ -23,7 +23,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
from convert import LlamaHfVocab from convert import LlamaHfVocab, permute
###### MODEL DEFINITIONS ###### ###### MODEL DEFINITIONS ######
@ -326,8 +326,7 @@ class Model(ABC):
toktypes: list[int] = [] toktypes: list[int] = []
if not tokenizer_path.is_file(): if not tokenizer_path.is_file():
print(f'Error: Missing {tokenizer_path}', file=sys.stderr) raise FileNotFoundError(f"File not found: {tokenizer_path}")
sys.exit(1)
tokenizer = SentencePieceProcessor(str(tokenizer_path)) tokenizer = SentencePieceProcessor(str(tokenizer_path))
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
@ -514,6 +513,17 @@ class BloomModel(Model):
class MPTModel(Model): class MPTModel(Model):
model_arch = gguf.MODEL_ARCH.MPT model_arch = gguf.MODEL_ARCH.MPT
def set_vocab(self):
try:
self._set_vocab_gpt2()
except Exception:
# Fallback for SEA-LION model
self._set_vocab_sentencepiece()
self.gguf_writer.add_add_bos_token(False)
self.gguf_writer.add_pad_token_id(3)
self.gguf_writer.add_eos_token_id(1)
self.gguf_writer.add_unk_token_id(0)
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.hparams["n_layers"] block_count = self.hparams["n_layers"]
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name)
@ -527,7 +537,10 @@ class MPTModel(Model):
self.gguf_writer.add_layer_norm_eps(1e-5) self.gguf_writer.add_layer_norm_eps(1e-5)
if self.hparams["attn_config"]["clip_qkv"] is not None: if self.hparams["attn_config"]["clip_qkv"] is not None:
self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) if self.hparams["attn_config"]["alibi"]:
self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
else:
self.gguf_writer.add_max_alibi_bias(0.0)
def write_tensors(self): def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
@ -776,6 +789,148 @@ class BaichuanModel(Model):
return weights[r * n_part:r * n_part + r, ...] return weights[r * n_part:r * n_part + r, ...]
@Model.register("XverseForCausalLM")
class XverseModel(Model):
model_arch = gguf.MODEL_ARCH.XVERSE
def set_vocab(self):
assert (self.dir_model / "tokenizer.json").is_file()
dir_model = self.dir_model
hparams = self.hparams
tokens: list[bytearray] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model)
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
added_vocab = tokenizer.get_added_vocab()
for token_id in range(vocab_size):
token_text = reverse_vocab[token_id].encode('utf-8')
# replace "\x00" with a string of length > 0
if token_text == b"\x00":
toktype = gguf.TokenType.BYTE # special
token_text = f"<{token_text}>".encode('utf-8')
elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
toktype = gguf.TokenType.BYTE # special
elif reverse_vocab[token_id] in added_vocab:
if tokenizer.added_tokens_decoder[token_id].special:
toktype = gguf.TokenType.CONTROL
else:
toktype = gguf.TokenType.USER_DEFINED
else:
toktype = gguf.TokenType.NORMAL
tokens.append(token_text)
toktypes.append(toktype)
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
head_count = self.hparams["num_attention_heads"]
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
hf_repo = self.hparams.get("_name_or_path", "")
ctx_length = 0
if "max_sequence_length" in self.hparams:
ctx_length = self.hparams["max_sequence_length"]
elif "max_position_embeddings" in self.hparams:
ctx_length = self.hparams["max_position_embeddings"]
elif "model_max_length" in self.hparams:
ctx_length = self.hparams["model_max_length"]
else:
print("gguf: can not find ctx length parameter.")
sys.exit()
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
self.gguf_writer.add_head_count(head_count)
self.gguf_writer.add_head_count_kv(head_count_kv)
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
def write_tensors(self):
# Collect tensors from generator object
model_kv = dict(self.get_tensors())
block_count = self.hparams["num_hidden_layers"]
head_count = self.hparams["num_attention_heads"]
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
for name, data_torch in model_kv.items():
# we don't need these
if name.endswith(".rotary_emb.inv_freq"):
continue
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
# HF models permute some of the tensors, so we need to undo that
if name.endswith(("q_proj.weight")):
data_torch = self._reverse_hf_permute(data_torch, head_count, head_count)
if name.endswith(("k_proj.weight")):
data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv)
data = data_torch.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
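        # Hugging Face checkpoints store q/k projection weights with the rotary
        # halves interleaved; regrouping the rows into (n_head, 2, rows/n_head/2, ...)
        # and swapping the two grouping axes restores the original layout.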
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (
weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape)
)
@Model.register("FalconForCausalLM", "RWForCausalLM") @Model.register("FalconForCausalLM", "RWForCausalLM")
class FalconModel(Model): class FalconModel(Model):
model_arch = gguf.MODEL_ARCH.FALCON model_arch = gguf.MODEL_ARCH.FALCON
@ -1055,12 +1210,120 @@ class StableLMModel(Model):
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
@Model.register("MixtralForCausalLM") @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
class MixtralModel(Model): class LlamaModel(Model):
model_arch = gguf.MODEL_ARCH.LLAMA model_arch = gguf.MODEL_ARCH.LLAMA
def set_vocab(self): def set_vocab(self):
self._set_vocab_sentencepiece() try:
self. _set_vocab_sentencepiece()
except FileNotFoundError:
self._set_vocab_llama_hf()
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
# Same as super class, but permuting q_proj, k_proj
def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
n_head = self.hparams.get("num_attention_heads")
n_kv_head = self.hparams.get("num_key_value_heads")
n_experts = self.hparams.get("num_local_experts")
experts = dict()
for name, data_torch in self.get_tensors():
# we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
continue
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.numpy()
if name.endswith("q_proj.weight"):
data = permute(data, n_head, n_head)
if name.endswith("k_proj.weight"):
data = permute(data, n_head, n_kv_head)
data = data.squeeze()
# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
experts[name] = data
if len(experts) >= n_experts:
# merge the experts into a single 3d tensor
for bid in range(block_count):
for wid in range(1, 4):
full = True
for xid in range(n_experts):
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
if ename not in experts:
full = False
break
if not full:
continue
datas = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
datas.append(experts[ename])
del experts[ename]
data = np.stack(datas, axis=0)
data_dtype = data.dtype
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
if self.ftype == 1 and data_dtype == np.float32:
data = data.astype(np.float16)
merged_name = f"layers.{bid}.feed_forward.experts.w{wid}.weight"
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
continue
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# 1d tensors need to be converted to float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts.keys()}")
@Model.register("GrokForCausalLM") @Model.register("GrokForCausalLM")
@ -1077,6 +1340,92 @@ class GrokModel(Model):
super().set_gguf_parameters() super().set_gguf_parameters()
self.gguf_writer.add_name("Grok") self.gguf_writer.add_name("Grok")
def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
n_experts = self.hparams.get("num_local_experts")
experts = dict()
for name, data_torch in self.get_tensors():
# we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
continue
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
# process the experts separately
if name.find(".moe.") != -1:
experts[name] = data
if len(experts) >= n_experts:
# merge the experts into a single 3d tensor
for bid in range(block_count):
for wid in ["linear", "linear_1", "linear_v"]:
full = True
for xid in range(n_experts):
ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
if ename not in experts:
full = False
break
if not full:
continue
datas = []
for xid in range(n_experts):
ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
datas.append(experts[ename])
del experts[ename]
data = np.stack(datas, axis=0)
data_dtype = data.dtype
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
if self.ftype == 1 and data_dtype == np.float32:
data = data.astype(np.float16)
merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
continue
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
@Model.register("MiniCPMForCausalLM") @Model.register("MiniCPMForCausalLM")
class MiniCPMModel(Model): class MiniCPMModel(Model):

View File

@ -828,6 +828,15 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description) return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor:
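    # Lazily stack the per-expert 2D tensors into a single 3D tensor (experts on
    # the leading axis); the actual load is deferred until the tensor is written.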
def load() -> Tensor:
tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors]
return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors]))
s = lazy_tensors[0].shape.copy()
s.insert(0, len(lazy_tensors))
return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors))
# Functionality that simulates `torch.load` but where individual tensors are # Functionality that simulates `torch.load` but where individual tensors are
# only loaded into memory on demand, not all at once. # only loaded into memory on demand, not all at once.
# PyTorch can't do this natively as of time of writing: # PyTorch can't do this natively as of time of writing:
@ -1246,6 +1255,22 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
tmp = model tmp = model
# merge experts into one tensor
if params.n_experts and params.n_experts > 0:
for i_l in range(params.n_layer):
for w in range(1, 4):
experts = []
for e in range(params.n_experts):
if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model:
experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"])
del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]
elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model:
experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
else:
raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)
# HF models permute or pack some of the tensors, so we need to undo that
for i in itertools.count(): for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" in model: if f"model.layers.{i}.self_attn.q_proj.weight" in model:

View File

@ -0,0 +1,5 @@
set(TARGET gbnf-validator)
add_executable(${TARGET} gbnf-validator.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@ -0,0 +1,132 @@
#define LLAMA_API_INTERNAL
#include "grammar-parser.h"
#include "ggml.h"
#include "llama.h"
#include "unicode.h"
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>
static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
auto decoded = decode_utf8(input_str, {});
const auto & code_points = decoded.first;
size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks;
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
if (grammar->stacks.empty()) {
error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
grammar->stacks = prev_stacks;
return false;
}
++pos;
}
for (const auto & stack : grammar->stacks) {
if (stack.empty()) {
return true;
}
}
error_pos = pos;
error_msg = "Unexpected end of input";
return false;
}
static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) {
fprintf(stdout, "Input string is invalid according to the grammar.\n");
fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos);
fprintf(stdout, "\n");
fprintf(stdout, "Input string:\n");
fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str());
if (error_pos < input_str.size()) {
fprintf(stdout, "\033[1;31m%c", input_str[error_pos]);
if (error_pos+1 < input_str.size()) {
fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str());
}
fprintf(stdout, "\033[0m\n");
}
}
int main(int argc, char** argv) {
if (argc != 3) {
fprintf(stdout, "Usage: %s <grammar_filename> <input_filename>\n", argv[0]);
return 1;
}
const std::string grammar_filename = argv[1];
const std::string input_filename = argv[2];
// Read the GBNF grammar file
FILE* grammar_file = fopen(grammar_filename.c_str(), "r");
if (!grammar_file) {
fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str());
return 1;
}
fseek(grammar_file, 0, SEEK_END);
size_t grammar_size = ftell(grammar_file);
fseek(grammar_file, 0, SEEK_SET);
std::string grammar_str(grammar_size, ' ');
fread(&grammar_str[0], 1, grammar_size, grammar_file);
fclose(grammar_file);
// Parse the GBNF grammar
auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
fprintf(stdout, "%s: failed to parse grammar\n", __func__);
return 1;
}
// Ensure that there is a "root" node.
if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
return 1;
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
// Create the LLAMA grammar
auto grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
// Read the input file
FILE* input_file = fopen(input_filename.c_str(), "r");
if (!input_file) {
fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str());
return 1;
}
fseek(input_file, 0, SEEK_END);
size_t input_size = ftell(input_file);
fseek(input_file, 0, SEEK_SET);
std::string input_str(input_size, ' ');
fread(&input_str[0], 1, input_size, input_file);
fclose(input_file);
// Validate the input string against the grammar
size_t error_pos;
std::string error_msg;
bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg);
if (is_valid) {
fprintf(stdout, "Input string is valid according to the grammar.\n");
} else {
print_error_message(input_str, error_pos, error_msg);
}
// Clean up
llama_grammar_free(grammar);
return 0;
}
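As a hedged usage sketch for the validator above (the binary location and file names are illustrative; the repository's `grammars/` folder is assumed to provide example GBNF files):

```sh
# check a candidate output file against a JSON grammar
./gbnf-validator grammars/json.gbnf candidate_output.json
```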

View File

@ -28,9 +28,11 @@ enum split_operation : uint8_t {
struct split_params { struct split_params {
split_operation operation = SPLIT_OP_SPLIT; split_operation operation = SPLIT_OP_SPLIT;
size_t n_bytes_split = 0;
int n_split_tensors = 128; int n_split_tensors = 128;
std::string input; std::string input;
std::string output; std::string output;
bool dry_run = false;
}; };
static void split_print_usage(const char * executable) { static void split_print_usage(const char * executable) {
@ -41,15 +43,36 @@ static void split_print_usage(const char * executable) {
printf("Apply a GGUF operation on IN to OUT."); printf("Apply a GGUF operation on IN to OUT.");
printf("\n"); printf("\n");
printf("options:\n"); printf("options:\n");
printf(" -h, --help show this help message and exit\n"); printf(" -h, --help show this help message and exit\n");
printf(" --version show version and build info\n"); printf(" --version show version and build info\n");
printf(" --split split GGUF to multiple GGUF (default)\n"); printf(" --split split GGUF to multiple GGUF (enabled by default)\n");
printf(" --split-max-tensors max tensors in each split: default(%d)\n", default_params.n_split_tensors); printf(" --merge merge multiple GGUF to a single GGUF\n");
printf(" --merge merge multiple GGUF to a single GGUF\n"); printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors);
printf(" --split-max-size N(M|G) max size per split\n");
printf(" --dry-run only print out a split plan and exit, without writing any new files\n");
printf("\n"); printf("\n");
} }
// convert a size string such as "128M" or "4G" to a number of bytes
static size_t split_str_to_n_bytes(std::string str) {
size_t n_bytes = 0;
int n = 0; // initialized so a failed sscanf is rejected by the n <= 0 check below
if (str.back() == 'M') {
sscanf(str.c_str(), "%d", &n);
n_bytes = n * 1024 * 1024; // megabytes
} else if (str.back() == 'G') {
sscanf(str.c_str(), "%d", &n);
n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes (cast first to avoid 32-bit int overflow)
} else {
throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
}
if (n <= 0) {
throw std::invalid_argument("error: size must be a positive value");
}
return n_bytes;
}
static void split_params_parse_ex(int argc, const char ** argv, split_params & params) {
std::string arg; std::string arg;
const std::string arg_prefix = "--"; const std::string arg_prefix = "--";
bool invalid_param = false; bool invalid_param = false;
@ -62,6 +85,8 @@ static bool split_params_parse_ex(int argc, const char ** argv, split_params & p
} }
bool arg_found = false; bool arg_found = false;
bool is_op_set = false;
bool is_mode_set = false;
if (arg == "-h" || arg == "--help") { if (arg == "-h" || arg == "--help") {
split_print_usage(argv[0]); split_print_usage(argv[0]);
exit(0); exit(0);
@ -71,23 +96,46 @@ static bool split_params_parse_ex(int argc, const char ** argv, split_params & p
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
exit(0); exit(0);
} }
if (arg == "--dry-run") {
arg_found = true;
params.dry_run = true;
}
if (is_op_set) {
throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
}
if (arg == "--merge") { if (arg == "--merge") {
arg_found = true; arg_found = true;
is_op_set = true;
params.operation = SPLIT_OP_MERGE; params.operation = SPLIT_OP_MERGE;
} }
if (arg == "--split") { if (arg == "--split") {
arg_found = true; arg_found = true;
is_op_set = true;
params.operation = SPLIT_OP_SPLIT; params.operation = SPLIT_OP_SPLIT;
} }
if (is_mode_set) {
throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
}
if (arg == "--split-max-tensors") { if (arg == "--split-max-tensors") {
if (++arg_idx >= argc) { if (++arg_idx >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
arg_found = true; arg_found = true;
is_mode_set = true;
params.n_split_tensors = atoi(argv[arg_idx]); params.n_split_tensors = atoi(argv[arg_idx]);
} }
if (arg == "--split-max-size") {
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
arg_found = true;
is_mode_set = true;
params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]);
}
if (!arg_found) { if (!arg_found) {
throw std::invalid_argument("error: unknown argument: " + arg); throw std::invalid_argument("error: unknown argument: " + arg);
@ -99,24 +147,17 @@ static bool split_params_parse_ex(int argc, const char ** argv, split_params & p
} }
if (argc - arg_idx < 2) { if (argc - arg_idx < 2) {
printf("%s: bad arguments\n", argv[0]); throw std::invalid_argument("error: bad arguments");
split_print_usage(argv[0]);
return false;
} }
params.input = argv[arg_idx++]; params.input = argv[arg_idx++];
params.output = argv[arg_idx++]; params.output = argv[arg_idx++];
return true;
} }
static bool split_params_parse(int argc, const char ** argv, split_params & params) { static bool split_params_parse(int argc, const char ** argv, split_params & params) {
bool result = true; bool result = true;
try { try {
if (!split_params_parse_ex(argc, argv, params)) { split_params_parse_ex(argc, argv, params);
split_print_usage(argv[0]);
exit(EXIT_FAILURE);
}
} }
catch (const std::invalid_argument & ex) { catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what()); fprintf(stderr, "%s\n", ex.what());
@ -140,15 +181,11 @@ struct split_strategy {
struct ggml_context * ctx_meta = NULL; struct ggml_context * ctx_meta = NULL;
const int n_tensors; const int n_tensors;
const int n_split; // one ctx_out per one output file
int i_split = 0; std::vector<struct gguf_context *> ctx_outs;
int i_tensor = 0; // temporary buffer for reading in tensor data
std::vector<uint8_t> read_buf;
std::vector<uint8_t> read_data;
struct gguf_context * ctx_out;
std::ofstream fout;
split_strategy(const split_params & params, split_strategy(const split_params & params,
std::ifstream & f_input, std::ifstream & f_input,
@ -158,79 +195,141 @@ struct split_strategy {
f_input(f_input), f_input(f_input),
ctx_gguf(ctx_gguf), ctx_gguf(ctx_gguf),
ctx_meta(ctx_meta), ctx_meta(ctx_meta),
n_tensors(gguf_get_n_tensors(ctx_gguf)), n_tensors(gguf_get_n_tensors(ctx_gguf)) {
n_split(std::ceil(1. * n_tensors / params.n_split_tensors)) {
// because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
int i_split = -1;
struct gguf_context * ctx_out = NULL;
auto new_ctx_out = [&]() {
i_split++;
if (ctx_out != NULL) {
if (gguf_get_n_tensors(ctx_out) == 0) {
fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
exit(EXIT_FAILURE);
}
ctx_outs.push_back(ctx_out);
}
ctx_out = gguf_init_empty();
// Save all metadata in first split only
if (i_split == 0) {
gguf_set_kv(ctx_out, ctx_gguf);
}
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, 0); // placeholder
gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
};
// initialize ctx_out for the first split
new_ctx_out();
// process tensors one by one
size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
for (int i = 0; i < n_tensors; ++i) {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
// calculate the "imaginary" size = the current size + next tensor size
size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
size_t next_tensors_size = curr_tensors_size + n_bytes;
if (should_split(i, next_tensors_size)) {
new_ctx_out();
curr_tensors_size = n_bytes;
} else {
curr_tensors_size = next_tensors_size;
}
gguf_add_tensor(ctx_out, t);
} }
bool should_split() const { // push the last ctx_out
return i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0; ctx_outs.push_back(ctx_out);
// set the correct n_split for all ctx_out
for (auto & ctx : ctx_outs) {
gguf_set_val_u16(ctx, LLM_KV_SPLIT_COUNT, ctx_outs.size());
}
} }
void split_start() { ~split_strategy() {
ctx_out = gguf_init_empty(); for (auto & ctx_out : ctx_outs) {
gguf_free(ctx_out);
// Save all metadata in first split only
if (i_split == 0) {
gguf_set_kv(ctx_out, ctx_gguf);
} }
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
// populate the original tensors, so we get an initial metadata
for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
gguf_add_tensor(ctx_out, meta);
}
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
fprintf(stderr, "%s: %s ...", __func__, split_path);
fout = std::ofstream(split_path, std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
auto meta_size = gguf_get_meta_size(ctx_out);
// placeholder for the meta data
::zeros(fout, meta_size);
i_split++;
} }
void next_tensor() { bool should_split(int i_tensor, size_t next_size) {
const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); if (params.n_bytes_split > 0) {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); // split by max size per file
auto n_bytes = ggml_nbytes(t); return next_size > params.n_bytes_split;
} else {
if (read_data.size() < n_bytes) { // split by number of tensors per file
read_data.resize(n_bytes); return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
} }
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
f_input.seekg(offset);
f_input.read((char *)read_data.data(), n_bytes);
t->data = read_data.data();
// write tensor data + padding
fout.write((const char *)t->data, n_bytes);
zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
i_tensor++;
} }
void split_end() { void print_info() {
// go back to beginning of file and write the updated metadata printf("n_split: %ld\n", ctx_outs.size());
fout.seekp(0); int i_split = 0;
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out)); for (auto & ctx_out : ctx_outs) {
gguf_get_meta_data(ctx_out, data.data()); // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
fout.write((const char *)data.data(), data.size()); size_t total_size = gguf_get_meta_size(ctx_out);
for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
total_size += ggml_nbytes(t);
}
total_size = total_size / 1024 / 1024; // convert to megabytes
printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
i_split++;
}
}
fout.close(); void write() {
gguf_free(ctx_out); int i_split = 0;
int n_split = ctx_outs.size();
for (auto & ctx_out : ctx_outs) {
// construct file path
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
fprintf(stderr, "\033[3Ddone\n"); // open the output file
printf("Writing file %s ... ", split_path);
fflush(stdout);
std::ofstream fout = std::ofstream(split_path, std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
// write metadata
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
gguf_get_meta_data(ctx_out, data.data());
fout.write((const char *)data.data(), data.size());
// write tensors
for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) {
// read tensor meta and prepare buffer
const char * t_name = gguf_get_tensor_name(ctx_out, i);
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
auto n_bytes = ggml_nbytes(t);
read_buf.resize(n_bytes);
// calculate offset
auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
// copy tensor from input to output file
copy_file_to_file(f_input, fout, offset, n_bytes);
zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
}
printf("done\n");
// close the file
fout.close();
i_split++;
}
}
void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) {
// TODO: detect OS and use copy_file_range() here for better performance
if (read_buf.size() < len) {
read_buf.resize(len);
}
f_in.seekg(in_offset);
f_in.read((char *)read_buf.data(), len);
f_out.write((const char *)read_buf.data(), len);
} }
}; };
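In `write()` above, each output's file name comes from `llama_split_path()`. As an illustration only, here is a tiny sketch of the conventional `<prefix>-%05d-of-%05d.gguf` naming that split files follow; the authoritative format lives behind `llama_split_path()` in llama.cpp, so treat the pattern below as an assumption:

```cpp
#include <cstddef>
#include <cstdio>

// hypothetical stand-in for llama_split_path(); assumes the usual
// "<prefix>-<index>-of-<count>.gguf" naming with 1-based, zero-padded indices
static void split_path_sketch(char * out, size_t maxlen,
                              const char * prefix, int i_split, int n_split) {
    snprintf(out, maxlen, "%s-%05d-of-%05d.gguf", prefix, i_split + 1, n_split);
}

int main() {
    char path[512];
    split_path_sketch(path, sizeof(path), "ggml-model-q4_0", 0, 3);
    printf("%s\n", path); // ggml-model-q4_0-00001-of-00003.gguf
}
```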
@ -254,32 +353,22 @@ static void gguf_split(const split_params & split_params) {
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
// prepare the strategy
split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta); split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
int n_split = strategy.ctx_outs.size();
strategy.print_info();
char first_split_path[PATH_MAX] = {0}; if (!split_params.dry_run) {
llama_split_path(first_split_path, sizeof(first_split_path), // write all output splits
split_params.output.c_str(), strategy.i_split, strategy.n_split); strategy.write();
fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
__func__, split_params.input.c_str(),
first_split_path,
split_params.n_split_tensors);
strategy.split_start();
while (strategy.i_tensor < strategy.n_tensors) {
strategy.next_tensor();
if (strategy.should_split()) {
strategy.split_end();
strategy.split_start();
}
} }
strategy.split_end();
// done, clean up
gguf_free(ctx_gguf); gguf_free(ctx_gguf);
f_input.close(); f_input.close();
fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n", fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n",
__func__, strategy.n_split, strategy.n_tensors); __func__, n_split, strategy.n_tensors);
} }
static void gguf_merge(const split_params & split_params) { static void gguf_merge(const split_params & split_params) {
@ -448,10 +537,6 @@ static void gguf_merge(const split_params & split_params) {
} }
int main(int argc, const char ** argv) { int main(int argc, const char ** argv) {
if (argc < 3) {
split_print_usage(argv[0]);
}
split_params params; split_params params;
split_params_parse(argc, argv, params); split_params_parse(argc, argv, params);

View File

@ -98,35 +98,38 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const float * data = is_host ? (const float *) src1->data : m_src1_data.data(); const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
// this has been adapted to the new format of storing merged experts in a single 3d tensor
// ref: https://github.com/ggerganov/llama.cpp/pull/6387
if (t->op == GGML_OP_MUL_MAT_ID) { if (t->op == GGML_OP_MUL_MAT_ID) {
const int idx = ((int32_t *) t->op_params)[0]; const int idx = ((int32_t *) t->op_params)[0];
const int n_as = ((int32_t *) t->op_params)[1]; const ggml_tensor * ids = t->src[2];
const int n_as = src0->ne[2];
// the top-k selected expert ids are stored in the src0 tensor // the top-k selected expert ids are stored in the ids tensor
// for simplicity, always copy src0 to host, because it is small // for simplicity, always copy ids to host, because it is small
// take into account that src0 is not contiguous! // take into account that ids is not contiguous!
GGML_ASSERT(src0->ne[1] == src1->ne[1]); GGML_ASSERT(ids->ne[1] == src1->ne[1]);
GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int))); GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int)));
m_ids.resize(ggml_nbytes(src0)/sizeof(int)); m_ids.resize(ggml_nbytes(ids)/sizeof(int));
ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0)); ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
auto & e = m_stats[wname];
++e.ncall;
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
// using the following line, we can correct for that if needed by replacing the line above with:
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
// loop over all possible experts, regardless if they are used or not in the batch // loop over all possible experts, regardless if they are used or not in the batch
// this is necessary to guarantee equal number of "ncall" for each tensor
for (int ex = 0; ex < n_as; ++ex) { for (int ex = 0; ex < n_as; ++ex) {
src0 = t->src[2 + ex]; size_t e_start = ex*src1->ne[0];
wname = filter_tensor_name(src0->name);
auto& e = m_stats[wname];
if (e.values.empty()) { if (e.values.empty()) {
e.values.resize(src1->ne[0], 0); e.values.resize(src1->ne[0]*n_as, 0);
} }
else if (e.values.size() != (size_t)src1->ne[0]) { else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ASSERT(false); exit(1); //GGML_ASSERT(false);
} }
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
// using the following line, we can correct for that if needed
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
++e.ncall;
if (m_params.verbosity > 1) { if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
} }
@ -136,7 +139,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (excur != ex) continue; if (excur != ex) continue;
const float * x = data + row * src1->ne[0]; const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) { for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j]; e.values[e_start + j] += x[j]*x[j];
} }
} }
if (e.ncall > m_last_call) { if (e.ncall > m_last_call) {

View File

@ -116,13 +116,13 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
std::ifstream in(imatrix_file.c_str(), std::ios::binary); std::ifstream in(imatrix_file.c_str(), std::ios::binary);
if (!in) { if (!in) {
printf("%s: failed to open %s\n",__func__, imatrix_file.c_str()); printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
return; exit(1);
} }
int n_entries; int n_entries;
in.read((char *)&n_entries, sizeof(n_entries)); in.read((char *)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) { if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, imatrix_file.c_str()); printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
return; exit(1);
} }
for (int i = 0; i < n_entries; ++i) { for (int i = 0; i < n_entries; ++i) {
int len; in.read((char *)&len, sizeof(len)); int len; in.read((char *)&len, sizeof(len));
@ -130,11 +130,11 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
in.read((char *)name_as_vec.data(), len); in.read((char *)name_as_vec.data(), len);
if (in.fail()) { if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str()); printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
return; exit(1);
} }
name_as_vec[len] = 0; name_as_vec[len] = 0;
std::string name{name_as_vec.data()}; std::string name{name_as_vec.data()};
auto & e = imatrix_data[std::move(name)]; auto & e = imatrix_data[name];
int ncall; int ncall;
in.read((char *)&ncall, sizeof(ncall)); in.read((char *)&ncall, sizeof(ncall));
int nval; int nval;
@ -142,18 +142,22 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
if (in.fail() || nval < 1) { if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n", __func__, i); printf("%s: failed reading number of values for entry %d\n", __func__, i);
imatrix_data = {}; imatrix_data = {};
return; exit(1);
} }
e.resize(nval); e.resize(nval);
in.read((char *)e.data(), nval*sizeof(float)); in.read((char *)e.data(), nval*sizeof(float));
if (in.fail()) { if (in.fail()) {
printf("%s: failed reading data for entry %d\n", __func__, i); printf("%s: failed reading data for entry %d\n", __func__, i);
imatrix_data = {}; imatrix_data = {};
return; exit(1);
} }
if (ncall > 0) { if (ncall > 0) {
for (auto& v : e) v /= ncall; for (auto& v : e) v /= ncall;
} }
if (getenv("LLAMA_TRACE")) {
printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
}
} }
printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str()); printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
} }
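The reads in `load_imatrix()` above imply a simple binary layout: an entry count, then for each entry a name length, the raw name bytes, a call count, a value count, and the float values (which the loader divides by the call count). A minimal writer sketch that produces one entry the loader would accept, assuming the usual 32-bit `int` and a hypothetical tensor name:

```cpp
#include <cstdint>
#include <fstream>
#include <string>
#include <vector>

int main() {
    std::ofstream out("imatrix.sketch.dat", std::ios::binary);

    int32_t n_entries = 1;                          // number of entries in the file
    out.write((const char *) &n_entries, sizeof(n_entries));

    std::string name = "blk.0.attn_q.weight";       // hypothetical tensor name
    int32_t len = (int32_t) name.size();
    out.write((const char *) &len, sizeof(len));    // name length
    out.write(name.data(), len);                    // name bytes, no terminator

    int32_t ncall = 4;                              // loader divides each value by ncall
    out.write((const char *) &ncall, sizeof(ncall));

    std::vector<float> values = {1.0f, 2.0f, 3.0f, 4.0f};
    int32_t nval = (int32_t) values.size();
    out.write((const char *) &nval, sizeof(nval));  // number of float values
    out.write((const char *) values.data(), nval * sizeof(float));
    return 0;
}
```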

View File

@ -16,52 +16,50 @@ The project is under active development, and we are [looking for feedback and co
**Command line options:** **Command line options:**
- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching, this parameter is used only if one token is to be processed on CPU backend. - `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU. - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`) - `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (default: unused). - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused
- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository (default: unused). - `-hfr REPO, --hf-repo REPO`: Hugging Face model repository. Default: unused
- `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused). - `-hff FILE, --hf-file FILE`: Hugging Face model file. Default: unused
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is `512`, but LLaMA models were built with a context of `2048`, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were built with a context of `4096`.
- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. - `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default, GPU `0` is used.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs, this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`. - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`
- `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`. - `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512`
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- `--numa STRATEGY`: Attempt one of the below optimization strategies that help on some NUMA systems - `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
- `--numa distribute`: Spread execution evenly over all nodes - `--numa distribute`: Spread execution evenly over all nodes
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
- `--numa numactl`: Use the CPU map provided by numactl - `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system
if run without this previously, it is recommended to drop the system page cache before using this page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
see https://github.com/ggerganov/llama.cpp/issues/1437
- `--numa`: Attempt optimizations that help on some NUMA systems. - `--numa`: Attempt optimizations that may help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default: `600`
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. - `--host`: Set the hostname or ip address to listen. Default: `127.0.0.1`
- `--port`: Set the port to listen. Default: `8080`. - `--port`: Set the port to listen. Default: `8080`
- `--path`: path from which to serve static files (default: disabled) - `--path`: Path from which to serve static files. Default: disabled
- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. - `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s. - `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
- `--embedding`: Enable embedding extraction, Default: disabled. - `--embedding`: Enable embedding extraction. Default: disabled
- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1) - `-np N`, `--parallel N`: Set the number of slots for processing requests. Default: `1`
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled) - `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) - `-spf FNAME`, `--system-prompt-file FNAME`: Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w` - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend. Used together with group attention width `--grp-attn-w`. Default: `1`, which is disabled.
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n` - `--grp-attn-w`: Set the group attention width to extend context size through self-extend. Used together with group attention factor `--grp-attn-n`. Default: `512`
- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1) - `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1`
- `--slots-endpoint-disable`: Disable the slots state monitoring endpoint. Slots state may contain user data, prompts included. - `--slots-endpoint-disable`: Disable the slots state monitoring endpoint. Slots state may contain user data, prompts included.
- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled) - `--metrics`: Enable prometheus `/metrics` compatible endpoint. Default: disabled
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- `--log-disable`: Output logs to stdout only, not to `llama.log`. default: enabled. - `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json) - `--log-format FORMAT`: Define the log output to FORMAT: json or text. Default: `json`
**If compiled with `LLAMA_SERVER_SSL=ON`** **If compiled with `LLAMA_SERVER_SSL=ON`**
- `--ssl-key-file FNAME`: path to a file containing a PEM-encoded SSL private key - `--ssl-key-file FNAME`: path to a file containing a PEM-encoded SSL private key
@ -69,7 +67,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
## Build ## Build
server is build alongside everything else from the root of the project `server` is built alongside everything else from the root of the project
- Using `make`: - Using `make`:
@ -85,7 +83,7 @@ server is build alongside everything else from the root of the project
## Build with SSL ## Build with SSL
server can also be built with SSL support using OpenSSL 3 `server` can also be built with SSL support using OpenSSL 3
- Using `make`: - Using `make`:
@ -135,7 +133,7 @@ docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggerganov/
## Testing with CURL ## Testing with CURL
Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS. Using [curl](https://curl.se/). On Windows, `curl.exe` should be available in the base OS.
```sh ```sh
curl --request POST \ curl --request POST \
@ -159,7 +157,7 @@ mkdir llama-client
cd llama-client cd llama-client
``` ```
Create a index.js file and put inside this: Create an index.js file and put this inside:
```javascript ```javascript
const prompt = `Building a website can be done in 10 simple steps:`; const prompt = `Building a website can be done in 10 simple steps:`;
@ -190,8 +188,8 @@ node index.js
- 503 -> `{"status": "loading model"}` if the model is still being loaded. - 503 -> `{"status": "loading model"}` if the model is still being loaded.
- 500 -> `{"status": "error"}` if the model failed to load. - 500 -> `{"status": "error"}` if the model failed to load.
- 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below. - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
- 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available. - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.
- 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available. - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slots are currently available.
If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set. If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set.
@ -205,75 +203,75 @@ node index.js
- The model's `tokenizer.ggml.add_bos_token` metadata is `true` - The model's `tokenizer.ggml.add_bos_token` metadata is `true`
- The system prompt is empty - The system prompt is empty
`temperature`: Adjust the randomness of the generated text (default: 0.8). `temperature`: Adjust the randomness of the generated text. Default: `0.8`
`dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled). `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled.
`dynatemp_exponent`: Dynamic temperature exponent (default: 1.0). `dynatemp_exponent`: Dynamic temperature exponent. Default: `1.0`
`top_k`: Limit the next token selection to the K most probable tokens (default: 40). `top_k`: Limit the next token selection to the K most probable tokens. Default: `40`
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95). `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
`min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token (default: 0.05). `min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity). `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the prompt. By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
`stop`: Specify a JSON array of stopping strings. `stop`: Specify a JSON array of stopping strings.
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []). These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
`tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled). `tfs_z`: Enable tail free sampling with parameter z. Default: `1.0`, which is disabled.
`typical_p`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled). `typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
`repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1). `repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1`
`repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size). `repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
`penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true). `penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true`
`presence_penalty`: Repeat alpha presence penalty (default: 0.0, 0.0 = disabled). `presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled.
`frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled); `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
`penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens (default: `null` = use the original `prompt`). `penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens. Default: `null`, which is to use the original `prompt`.
`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0). `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
`mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0). `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
`mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1). `mirostat_eta`: Set the Mirostat learning rate, parameter eta. Default: `0.1`
`grammar`: Set grammar for grammar-based sampling (default: no grammar) `grammar`: Set grammar for grammar-based sampling. Default: no grammar
`seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed). `seed`: Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
`ignore_eos`: Ignore end of stream token and continue generating (default: false). `ignore_eos`: Ignore end of stream token and continue generating. Default: `false`
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. (default: []). `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0) `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token. Default: `0`
`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum (default: 0) `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
`id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1) `id_slot`: Assign the completion task to a specific slot. If it is `-1`, the task will be assigned to an idle slot. Default: `-1`
`cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false) `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. Default: `false`
`system_prompt`: Change the system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime) `system_prompt`: Change the system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. (default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values) `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
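The options above map directly onto the JSON body of a `/completion` request. As a purely illustrative sketch (the server example itself uses nlohmann::json, so the same library is used here; endpoint URL and values are placeholders), this is how such a body could be assembled and printed from C++ before POSTing it with your HTTP client of choice:

```cpp
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    // a few of the /completion options documented above; values are illustrative
    json body = {
        {"prompt",      "Building a website can be done in 10 simple steps:"},
        {"temperature", 0.8},
        {"n_predict",   128},
        {"stop",        json::array({"</s>"})},
        {"stream",      false},
    };
    // POST this to http://localhost:8080/completion with Content-Type: application/json
    std::cout << body.dump(2) << std::endl;
    return 0;
}
```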
### Result JSON ### Result JSON
- Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion. - Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure: - `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
@ -287,7 +285,7 @@ node index.js
}, },
{ {
"prob": float, "prob": float,
"tok_str": "<second most likely tonen>" "tok_str": "<second most likely token>"
}, },
... ...
] ]
@ -357,14 +355,14 @@ Notice that each `probs` is an array of length `n_probs`.
- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots. - `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots.
- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots. - `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `total_slots` - the total number of slots for processing requests (defined by the `--parallel` option) - `total_slots` - the total number of slots for processing requests (defined by the `--parallel` option)
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only model with [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, ChatML template will be used. - **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only model with [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, ChatML template will be used.
*Options:* *Options:*
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such are `mirostat` are supported. See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
*Examples:* *Examples:*
@ -514,16 +512,16 @@ Available metrics:
- `llamacpp:tokens_predicted_total`: Number of generation tokens processed. - `llamacpp:tokens_predicted_total`: Number of generation tokens processed.
- `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s. - `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s.
- `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s. - `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s.
- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage. - `llamacpp:kv_cache_usage_ratio`: KV-cache usage. `1` means 100 percent usage.
- `llamacpp:kv_cache_tokens`: KV-cache tokens. - `llamacpp:kv_cache_tokens`: KV-cache tokens.
- `llamacpp:requests_processing`: Number of request processing. - `llamacpp:requests_processing`: Number of requests processing.
- `llamacpp:requests_deferred`: Number of request deferred. - `llamacpp:requests_deferred`: Number of requests deferred.
## More examples ## More examples
### Change system prompt on runtime ### Change system prompt on runtime
To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it. To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once.
`prompt`: Specify a context that you want all connecting clients to respect. `prompt`: Specify a context that you want all connecting clients to respect.
@ -562,11 +560,11 @@ bash chat.sh
### OAI-like API ### OAI-like API
The HTTP server supports OAI-like API: https://github.com/openai/openai-openapi The HTTP `server` supports an OAI-like API: https://github.com/openai/openai-openapi
### API errors ### API errors
Server returns error in the same format as OAI: https://github.com/openai/openai-openapi `server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
Example of an error: Example of an error:

View File

@ -16,6 +16,7 @@ import matplotlib
import matplotlib.dates import matplotlib.dates
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import requests import requests
from statistics import mean
def main(args_in: list[str] | None = None) -> None: def main(args_in: list[str] | None = None) -> None:
@ -109,6 +110,7 @@ def main(args_in: list[str] | None = None) -> None:
# Prometheus # Prometheus
end_time = time.time() end_time = time.time()
prometheus_metrics = {}
if is_server_listening("0.0.0.0", 9090): if is_server_listening("0.0.0.0", 9090):
metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds', metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred'] 'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']
@ -127,6 +129,7 @@ def main(args_in: list[str] | None = None) -> None:
values = metric_data['data']['result'][0]['values'] values = metric_data['data']['result'][0]['values']
timestamps, metric_values = zip(*values) timestamps, metric_values = zip(*values)
metric_values = [float(value) for value in metric_values] metric_values = [float(value) for value in metric_values]
prometheus_metrics[metric] = metric_values
timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps] timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
plt.figure(figsize=(16, 10), dpi=80) plt.figure(figsize=(16, 10), dpi=80)
plt.plot(timestamps_dt, metric_values, label=metric) plt.plot(timestamps_dt, metric_values, label=metric)
@ -176,17 +179,20 @@ xychart-beta
# 140 chars max for commit status description # 140 chars max for commit status description
bench_results = { bench_results = {
"i": iterations,
"req": { "req": {
"p90": data['metrics']["http_req_duration"]["p(90)"], "p90": round(data['metrics']["http_req_duration"]["p(90)"], 2),
"avg": data['metrics']["http_req_duration"]["avg"], "avg": round(data['metrics']["http_req_duration"]["avg"], 2),
}, },
"pp": { "pp": {
"p90": data['metrics']["llamacpp_prompt_tokens"]["p(90)"], "p90": round(data['metrics']["llamacpp_prompt_tokens"]["p(90)"], 2),
"avg": data['metrics']["llamacpp_prompt_tokens"]["avg"], "avg": round(data['metrics']["llamacpp_prompt_tokens"]["avg"], 2),
"0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
}, },
"tg": { "tg": {
"p90": data['metrics']["llamacpp_tokens_second"]["p(90)"], "p90": round(data['metrics']["llamacpp_tokens_second"]["p(90)"], 2),
"avg": data['metrics']["llamacpp_tokens_second"]["avg"], "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
"0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
}, },
} }
with open("results.github.env", 'a') as github_env: with open("results.github.env", 'a') as github_env:

View File

@ -87,6 +87,7 @@ export default function () {
], ],
"model": model, "model": model,
"stream": false, "stream": false,
"seed": 42,
"max_tokens": max_tokens "max_tokens": max_tokens
} }

File diff suppressed because it is too large

View File

@ -222,6 +222,7 @@
temperature: 0.7, temperature: 0.7,
repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_last_n: 256, // 0 = disable penalty, -1 = context size
repeat_penalty: 1.18, // 1.0 = disabled repeat_penalty: 1.18, // 1.0 = disabled
penalize_nl: false,
top_k: 40, // <= 0 to use vocab size top_k: 40, // <= 0 to use vocab size
top_p: 0.95, // 1.0 = disabled top_p: 0.95, // 1.0 = disabled
min_p: 0.05, // 0 = disabled min_p: 0.05, // 0 = disabled
@ -627,6 +628,7 @@
const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value } const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) } const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) } const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
const updateParamsBool = (el) => params.value = { ...params.value, [el.target.name]: el.target.checked }
const grammarJsonSchemaPropOrder = signal('') const grammarJsonSchemaPropOrder = signal('')
const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
@ -670,6 +672,15 @@
` `
}; };
const BoolField = ({ label, name, value }) => {
return html`
<div>
<label for="${name}">${label}</label>
<input type="checkbox" id="${name}" name="${name}" checked="${value}" onclick=${updateParamsBool} />
</div>
`
};
const userTemplateReset = (e) => { const userTemplateReset = (e) => {
e.preventDefault(); e.preventDefault();
userTemplateResetToDefaultAndApply() userTemplateResetToDefaultAndApply()
@ -769,6 +780,7 @@
${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })} ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}

View File

@ -2189,8 +2189,6 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold); printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch); printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
printf(" -ub N, --ubatch-size N physical maximum batch size (default: %d)\n", params.n_ubatch); printf(" -ub N, --ubatch-size N physical maximum batch size (default: %d)\n", params.n_ubatch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
if (llama_supports_mlock()) { if (llama_supports_mlock()) {
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
} }
@ -2213,6 +2211,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row)\n"); printf(" or for intermediate results and KV (with split-mode = row)\n");
printf(" -nkvo, --no-kv-offload\n");
printf(" disable KV offload\n");
} }
printf(" -m FNAME, --model FNAME\n"); printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str()); printf(" model path (default: %s)\n", params.model.c_str());
@ -2498,6 +2498,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
"See main README.md for information on enabling GPU BLAS support", "See main README.md for information on enabling GPU BLAS support",
{{"n_gpu_layers", params.n_gpu_layers}}); {{"n_gpu_layers", params.n_gpu_layers}});
} }
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
params.no_kv_offload = true;
} else if (arg == "--split-mode" || arg == "-sm") { } else if (arg == "--split-mode" || arg == "-sm") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;

View File

@ -49,12 +49,23 @@ extern bool server_log_json;
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra);
template <typename T> template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value) { static T json_value(const json &body, const std::string &key, const T &default_value) {
// Fallback null to default value // Fallback null to default value
return body.contains(key) && !body.at(key).is_null() if (body.contains(key) && !body.at(key).is_null()){
? body.value(key, default_value) try {
: default_value; return body.value(key, default_value);
}
catch (nlohmann::json_abi_v3_11_3::detail::type_error const&){
std::string message = "Wrong type supplied for parameter '" + key + "'. Expected '" + typeid(default_value).name() + "', using default value.";
server_log("WARN", __func__, __LINE__, message.c_str(), body);
return default_value;
}
} else {
return default_value;
}
} }
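A minimal usage sketch of the hardened `json_value()` above. To keep the snippet self-contained, the warning goes to `stderr` instead of `server_log()`, and the exception is caught via the public `nlohmann::json::type_error` alias; otherwise the fallback behaviour mirrors the diff:

```cpp
#include <cstdio>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// simplified stand-in for the helper in the diff: missing, null, or
// wrongly-typed keys all fall back to the supplied default value
template <typename T>
static T json_value_sketch(const json & body, const std::string & key, const T & default_value) {
    if (body.contains(key) && !body.at(key).is_null()) {
        try {
            return body.value(key, default_value);
        } catch (const json::type_error &) {
            fprintf(stderr, "warning: wrong type for '%s', using default\n", key.c_str());
            return default_value;
        }
    }
    return default_value;
}

int main() {
    json body = json::parse(R"({"n_predict": "not-a-number", "temperature": 0.5})");
    int   n_predict   = json_value_sketch(body, "n_predict", -1);     // wrong type -> -1
    float temperature = json_value_sketch(body, "temperature", 0.8f); // present    -> 0.5
    int   top_k       = json_value_sketch(body, "top_k", 40);         // missing    -> 40
    printf("n_predict=%d temperature=%.2f top_k=%d\n", n_predict, temperature, top_k);
    return 0;
}
```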
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) { static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {

flake.lock generated
View File

@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1711163522, "lastModified": 1711703276,
"narHash": "sha256-YN/Ciidm+A0fmJPWlHBGvVkcarYWSC+s3NTPk/P+q3c=", "narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "44d0940ea560dee511026a53f0e2e2cde489b4d4", "rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
"type": "github" "type": "github"
}, },
"original": { "original": {

View File

@ -705,8 +705,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
struct ggml_tensor * leaf = graph->leafs[i]; struct ggml_tensor * leaf = graph->leafs[i];
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
galloc->leaf_allocs[i].buffer_id = hn->buffer_id; galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
galloc->leaf_allocs[i].leaf.offset = hn->offset; if (leaf->view_src || leaf->data) {
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
galloc->leaf_allocs[i].leaf.size_max = 0;
} else {
galloc->leaf_allocs[i].leaf.offset = hn->offset;
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
}
} }
// reallocate buffers if needed // reallocate buffers if needed

View File

@ -447,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
#define GGML_COMMON_IMPL #define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_SYCL) #elif defined(GGML_COMMON_IMPL_SYCL)
#include <cstdint> #include <cstdint>
#define GGML_TABLE_BEGIN(type, name, size) static dpct::global_memory<const type, 1> name(sycl::range<1>(size), { #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() }); #define GGML_TABLE_END() };
#define GGML_COMMON_IMPL #define GGML_COMMON_IMPL
#endif #endif
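For clarity on what this SYCL change does: a table declared through these macros now expands to an ordinary constant array instead of a `dpct::global_memory` object. A small self-contained illustration with the new macro definitions and a hypothetical table (the real tables and their contents live in ggml-common.h):

```cpp
#include <cstdint>
#include <cstdio>

// the new SYCL-side definitions from the diff
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };

// hypothetical table, only to show the expansion to a plain constant array
GGML_TABLE_BEGIN(uint8_t, example_table, 4)
    1, 2, 4, 8,
GGML_TABLE_END()

int main() {
    printf("%d\n", (int) example_table[2]); // prints 4
}
```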

View File

@ -401,10 +401,8 @@ GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t
GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
if (tensor->view_src != NULL && tensor->view_offs == 0) { if (tensor->view_src != NULL) {
assert(tensor->view_src->buffer->buft == buffer->buft); assert(tensor->view_src->buffer->buft == buffer->buft);
tensor->backend = tensor->view_src->backend;
tensor->extra = tensor->view_src->extra;
return; return;
} }
@ -1962,227 +1960,49 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
} }
} }
#if 0
template<typename ... Srcs>
static __global__ void k_compute_batched_ptrs_id(
const void ** ptrs_src, void ** ptrs_dst,
int ne12, int ne13,
int ne23,
int nb02, int nb03,
int nb12, int nb13,
int nb2, int nb3,
int r2, int r3,
ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
const half * src1_f16, half * dst_f16,
const int32_t * ids, const int id,
Srcs... src0s) {
int i = ids[id];
half * src0_f16;
const void * srcs_ar[] = { (const half *) src0s... };
if (src0_type == GGML_TYPE_F16) {
src0_f16 = (half *) srcs_ar[i];
} else {
src0_f16 = src0_as_f16;
if (threadIdx.x == 0 && threadIdx.y == 0) {
const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
}
}
int i13 = blockIdx.x * blockDim.x + threadIdx.x;
int i12 = blockIdx.y * blockDim.y + threadIdx.y;
if (i13 >= ne13 || i12 >= ne12) {
return;
}
int i03 = i13 / r3;
int i02 = i12 / r2;
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03;
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
}
static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
const struct ggml_tensor * ids = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
const struct ggml_tensor * src00 = dst->src[2];
const int id = dst->op_params[0];
GGML_ASSERT(!ggml_is_transposed(src00));
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
const int64_t ne01 = src00->ne[1];
const int64_t ne02 = src00->ne[2];
const int64_t ne03 = src00->ne[3];
//const int64_t nb01 = src00->nb[1];
const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
//const int64_t nb11 = src1->nb[1];
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
const int64_t ne1 = ggml_nelements(src1);
const int64_t ne = ggml_nelements(dst);
ggml_cuda_set_device(g_main_device);
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
//ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
//void * src0_ddq = src0_extra->data_device[g_main_device];
//half * src0_as_f16 = (half *) src0_ddq;
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
// convert src1 to fp16
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
GGML_ASSERT(to_fp16_cuda != nullptr);
size_t src1_as = 0;
half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
size_t dst_as = 0;
half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0);
// broadcast factors
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f;
// use cublasGemmBatchedEx
const int ne23 = ne12*ne13;
const void ** ptrs_src = nullptr;
void ** ptrs_dst = nullptr;
size_t ptrs_src_s = 0;
size_t ptrs_dst_s = 0;
ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
int64_t src0_ne = ggml_nelements(src00);
half * src0_as_f16 = nullptr;
size_t src0_as = 0;
if (src00->type != GGML_TYPE_F16) {
src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
}
static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
dim3 block_dims(ne13, ne12);
k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
ptrs_src, ptrs_dst,
ne12, ne13,
ne23,
ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
nb12, nb13,
dst->nb[2], dst->nb[3],
r2, r3,
src00->type, src0_as_f16, src0_ne,
src1_as_f16, dst_f16,
(const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
);
CUDA_CHECK(cudaGetLastError());
CUBLAS_CHECK(
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
&beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
ne23,
CUBLAS_COMPUTE_16F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
if (src0_as != 0) {
ggml_cuda_pool_free(src0_as_f16, src0_as);
}
if (ptrs_src_s != 0) {
ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
}
if (ptrs_dst_s != 0) {
ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
}
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
ggml_cuda_pool_free(src1_as_f16, src1_as);
ggml_cuda_pool_free(dst_f16, dst_as);
}
#endif
static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
#if 0
ggml_cuda_mul_mat_id_cublas(dst);
// TODO: mmq/mmv support
#endif
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src1 = dst->src[1];
const ggml_tensor * ids = dst->src[2];
GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
const size_t nb11 = src1->nb[1]; const size_t nb11 = src1->nb[1];
const size_t nb1 = dst->nb[1]; const size_t nb1 = dst->nb[1];
const struct ggml_tensor * ids = src0;
const int32_t id = ((int32_t *) dst->op_params)[0]; const int32_t id = ((int32_t *) dst->op_params)[0];
const int32_t n_as = ((int32_t *) dst->op_params)[1]; const int32_t n_as = src0->ne[2];
std::vector<char> ids_host(ggml_nbytes(ids)); std::vector<char> ids_host(ggml_nbytes(ids));
const char * ids_dev = (const char *) ids->data; const char * ids_dev = (const char *) ids->data;
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaStreamSynchronize(stream));
ggml_tensor src0_row = *src0;
ggml_tensor src1_row = *src1; ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst; ggml_tensor dst_row = *dst;
char * src0_original = (char *) src0->data;
char * src1_original = (char *) src1->data; char * src1_original = (char *) src1->data;
char * dst_original = (char *) dst->data; char * dst_original = (char *) dst->data;
src0_row.ne[2] = 1;
src0_row.ne[3] = 1;
src0_row.nb[3] = src0->nb[2];
if (src1->ne[1] == 1) { if (src1->ne[1] == 1) {
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(row_id >= 0 && row_id < n_as); GGML_ASSERT(row_id >= 0 && row_id < n_as);
const struct ggml_tensor * src0_row = dst->src[row_id + 2]; src0_row.data = src0_original + row_id*src0->nb[2];
src1_row.data = src1_original + i01*src1->nb[1]; src1_row.data = src1_original + i01*src1->nb[1];
dst_row.data = dst_original + i01*dst->nb[1]; dst_row.data = dst_original + i01*dst->nb[1];
ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row); ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
} }
} else { } else {
ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1)); ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
@ -2192,8 +2012,6 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
dst_row.data = dst_contiguous.get(); dst_row.data = dst_contiguous.get();
for (int32_t row_id = 0; row_id < n_as; ++row_id) { for (int32_t row_id = 0; row_id < n_as; ++row_id) {
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
int64_t num_src1_rows = 0; int64_t num_src1_rows = 0;
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
@ -2213,6 +2031,8 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
continue; continue;
} }
src0_row.data = src0_original + row_id*src0->nb[2];
src1_row.ne[1] = num_src1_rows; src1_row.ne[1] = num_src1_rows;
dst_row.ne[1] = num_src1_rows; dst_row.ne[1] = num_src1_rows;
@ -2224,7 +2044,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
dst_row.nb[2] = num_src1_rows*nb1; dst_row.nb[2] = num_src1_rows*nb1;
dst_row.nb[3] = num_src1_rows*nb1; dst_row.nb[3] = num_src1_rows*nb1;
ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row); ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
num_src1_rows = 0; num_src1_rows = 0;
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@ -2389,7 +2209,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst)); fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
GGML_ASSERT(false); CUDA_CHECK(err);
} }
return true; return true;
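The recurring pattern in the mul_mat_id changes above is that the experts are no longer separate tensors in dst->src[2..]; they are stacked along dimension 2 of a single src0, and each expert is addressed as a byte offset into that tensor. Condensed from the diff (names follow the code above), the per-expert slice looks like this:

    // shallow-copy src0 and turn it into a 2-D view of one expert
    ggml_tensor src0_row = *src0;                  // keeps type, ne[0..1], nb[0..1]
    src0_row.ne[2] = 1;                            // a single expert
    src0_row.ne[3] = 1;
    src0_row.nb[3] = src0->nb[2];                  // keep the stride layout consistent
    src0_row.data  = (char *) src0->data + row_id*src0->nb[2];   // expert `row_id`
    // src0_row is then handed to the ordinary mul_mat path as a plain 2-D matrix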



@ -8,32 +8,41 @@ static inline __device__ void ggml_cuda_swap(T & a, T & b) {
} }
template<ggml_sort_order order> template<ggml_sort_order order>
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) { static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
// bitonic sort // bitonic sort
int col = threadIdx.x; int col = threadIdx.x;
int row = blockIdx.y; int row = blockIdx.y;
if (col >= ncols) return; if (col >= ncols_pad) {
return;
}
const float * x_row = x + row * ncols; const float * x_row = x + row * ncols;
int * dst_row = dst + row * ncols; extern __shared__ int dst_row[];
// initialize indices // initialize indices
if (col < ncols) { dst_row[col] = col;
dst_row[col] = col;
}
__syncthreads(); __syncthreads();
for (int k = 2; k <= ncols; k *= 2) { for (int k = 2; k <= ncols_pad; k *= 2) {
for (int j = k / 2; j > 0; j /= 2) { for (int j = k / 2; j > 0; j /= 2) {
int ixj = col ^ j; int ixj = col ^ j;
if (ixj > col) { if (ixj > col) {
if ((col & k) == 0) { if ((col & k) == 0) {
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { if (dst_row[col] >= ncols ||
(dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
x_row[dst_row[col]] > x_row[dst_row[ixj]] :
x_row[dst_row[col]] < x_row[dst_row[ixj]]))
) {
ggml_cuda_swap(dst_row[col], dst_row[ixj]); ggml_cuda_swap(dst_row[col], dst_row[ixj]);
} }
} else { } else {
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { if (dst_row[ixj] >= ncols ||
(dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
x_row[dst_row[col]] < x_row[dst_row[ixj]] :
x_row[dst_row[col]] > x_row[dst_row[ixj]]))
) {
ggml_cuda_swap(dst_row[col], dst_row[ixj]); ggml_cuda_swap(dst_row[col], dst_row[ixj]);
} }
} }
@ -41,18 +50,35 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n
__syncthreads(); __syncthreads();
} }
} }
// copy the result to dst without the padding
if (col < ncols) {
dst[row * ncols + col] = dst_row[col];
}
}
static int next_power_of_2(int x) {
int n = 1;
while (n < x) {
n *= 2;
}
return n;
} }
static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) { static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
// bitonic sort requires ncols to be power of 2 // bitonic sort requires ncols to be power of 2
GGML_ASSERT((ncols & (ncols - 1)) == 0); const int ncols_pad = next_power_of_2(ncols);
const dim3 block_dims(ncols, 1, 1); const dim3 block_dims(ncols_pad, 1, 1);
const dim3 block_nums(1, nrows, 1); const dim3 block_nums(1, nrows, 1);
const size_t shared_mem = ncols_pad * sizeof(int);
GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
if (order == GGML_SORT_ORDER_ASC) { if (order == GGML_SORT_ORDER_ASC) {
k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols); k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
} else if (order == GGML_SORT_ORDER_DESC) { } else if (order == GGML_SORT_ORDER_DESC) {
k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols); k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);
} }
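A small host-side example of the padding arithmetic above, assuming nothing beyond the helper shown in the diff: padded index slots compare as out-of-range inside the kernel, so they sink to the tail of the sorted order and are simply not copied back out.

    #include <stdio.h>

    static int next_power_of_2(int x) {   // same helper as above
        int n = 1;
        while (n < x) {
            n *= 2;
        }
        return n;
    }

    int main(void) {
        const int    ncols     = 1000;                              // arbitrary row length
        const int    ncols_pad = next_power_of_2(ncols);            // 1024
        const size_t shmem     = (size_t) ncols_pad * sizeof(int);  // 4096 B of shared memory
        printf("ncols=%d ncols_pad=%d shared=%zu bytes\n", ncols, ncols_pad, shmem);
        return 0;   // shmem must stay within the device's shared-memory-per-block limit (smpb)
    }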


@ -1,7 +1,8 @@
#pragma once #pragma once
#include "../ggml.h" #include "ggml.h"
#include "../ggml-cuda.h" #include "ggml-cuda.h"
#include <memory> #include <memory>
#if defined(GGML_USE_HIPBLAS) #if defined(GGML_USE_HIPBLAS)
@ -11,7 +12,7 @@
#define GGML_COMMON_DECL_CUDA #define GGML_COMMON_DECL_CUDA
#define GGML_COMMON_IMPL_CUDA #define GGML_COMMON_IMPL_CUDA
#endif #endif
#include "../ggml-common.h" #include "ggml-common.h"
#include <cstdio> #include <cstdio>
#include <array> #include <array>


@ -2,14 +2,6 @@
#include "dequantize.cuh" #include "dequantize.cuh"
#include "convert.cuh" #include "convert.cuh"
// dmmv = dequantize_mul_mat_vec
#ifndef GGML_CUDA_DMMV_X
#define GGML_CUDA_DMMV_X 32
#endif
#ifndef GGML_CUDA_MMV_Y
#define GGML_CUDA_MMV_Y 1
#endif
#ifndef K_QUANTS_PER_ITERATION #ifndef K_QUANTS_PER_ITERATION
#define K_QUANTS_PER_ITERATION 2 #define K_QUANTS_PER_ITERATION 2
#else #else


@ -1,5 +1,16 @@
#include "common.cuh" #include "common.cuh"
// dmmv = dequantize_mul_mat_vec
// TODO: remove this?
#ifndef GGML_CUDA_DMMV_X
#define GGML_CUDA_DMMV_X 32
#endif
#ifndef GGML_CUDA_MMV_Y
#define GGML_CUDA_MMV_Y 1
#endif
void ggml_cuda_op_dequantize_mul_mat_vec( void ggml_cuda_op_dequantize_mul_mat_vec(
ggml_backend_cuda_context & ctx, ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,


@ -1685,37 +1685,31 @@ static enum ggml_status ggml_metal_graph_compute(
{ {
//GGML_ASSERT(ne00 == ne10); //GGML_ASSERT(ne00 == ne10);
//GGML_ASSERT(ne03 == ne13); //GGML_ASSERT(ne03 == ne13);
const int n_as = src0->ne[2];
GGML_ASSERT(src0t == GGML_TYPE_I32);
const int n_as = ((int32_t *) dst->op_params)[1];
// TODO: make this more general
GGML_ASSERT(n_as <= 8);
// max size of the src1ids array in the kernel shared buffer // max size of the src1ids array in the kernel shared buffer
GGML_ASSERT(ne11 <= 4096); GGML_ASSERT(ne11 <= 4096);
const int64_t ne20 = src2 ? src2->ne[0] : 0; // src2 = ids
const int64_t ne21 = src2 ? src2->ne[1] : 0; const int64_t ne20 = src2->ne[0]; GGML_UNUSED(ne20);
const int64_t ne22 = src2 ? src2->ne[2] : 0; const int64_t ne21 = src2->ne[1];
const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23); const int64_t ne22 = src2->ne[2]; GGML_UNUSED(ne22);
const int64_t ne23 = src2->ne[3]; GGML_UNUSED(ne23);
const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20); const uint64_t nb20 = src2->nb[0]; GGML_UNUSED(nb20);
const uint64_t nb21 = src2 ? src2->nb[1] : 0; const uint64_t nb21 = src2->nb[1];
const uint64_t nb22 = src2 ? src2->nb[2] : 0; const uint64_t nb22 = src2->nb[2]; GGML_UNUSED(nb22);
const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23); const uint64_t nb23 = src2->nb[3]; GGML_UNUSED(nb23);
const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t); const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);
GGML_ASSERT(!ggml_is_transposed(src2)); GGML_ASSERT(src2t == GGML_TYPE_I32);
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(src1t == GGML_TYPE_F32); GGML_ASSERT(src1t == GGML_TYPE_F32);
const uint r2 = ne12/ne22;
const uint r3 = ne13/ne23;
// find the break-even point where the matrix-matrix kernel becomes more efficient compared // find the break-even point where the matrix-matrix kernel becomes more efficient compared
// to the matrix-vector kernel // to the matrix-vector kernel
int ne11_mm_min = n_as; int ne11_mm_min = n_as;
@ -1723,7 +1717,10 @@ static enum ggml_status ggml_metal_graph_compute(
const int idx = ((int32_t *) dst->op_params)[0]; const int idx = ((int32_t *) dst->op_params)[0];
// batch size // batch size
GGML_ASSERT(ne01 == ne11); GGML_ASSERT(ne21 == ne11); // ?
GGML_ASSERT(ne12 == 1 && ne13 == 1); // no broadcasting
const uint r2 = 1;
const uint r3 = 1;
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
@ -1732,7 +1729,7 @@ static enum ggml_status ggml_metal_graph_compute(
// indirect matrix multiplication // indirect matrix multiplication
// !!! // !!!
if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
ne20 % 32 == 0 && ne20 >= 64 && ne00 % 32 == 0 && ne00 >= 64 &&
ne11 > ne11_mm_min) { ne11 > ne11_mm_min) {
// some Metal matrix data types require aligned pointers // some Metal matrix data types require aligned pointers
@ -1745,7 +1742,7 @@ static enum ggml_status ggml_metal_graph_compute(
id<MTLComputePipelineState> pipeline = nil; id<MTLComputePipelineState> pipeline = nil;
switch (src2->type) { switch (src0->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break; case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break;
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break; case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break;
@ -1774,36 +1771,27 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3]; [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
[encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:4];
[encoder setBytes:&ne22 length:sizeof(ne22) atIndex:5]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:5];
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:6];
[encoder setBytes:&nb22 length:sizeof(nb22) atIndex:7]; [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:8]; [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:9]; [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:9];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10]; [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:10];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11]; [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:11];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12]; [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:12];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:13];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:15];
[encoder setBytes:&r2 length:sizeof(r2) atIndex:16]; [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:16];
[encoder setBytes:&r3 length:sizeof(r3) atIndex:17]; [encoder setBytes:&r2 length:sizeof(r2) atIndex:17];
[encoder setBytes:&idx length:sizeof(idx) atIndex:18]; [encoder setBytes:&r3 length:sizeof(r3) atIndex:18];
// TODO: how to make this an array? read Metal docs [encoder setBytes:&idx length:sizeof(idx) atIndex:19];
for (int j = 0; j < 8; ++j) {
// NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
size_t offs_src_cur = 0;
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
}
[encoder setThreadgroupMemoryLength:GGML_PAD(8192 + 2*ne11, 16) atIndex:0]; [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + 2*ne11, 16) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne01 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
} else { } else {
int nth0 = 32; int nth0 = 32;
int nth1 = 1; int nth1 = 1;
@ -1813,7 +1801,7 @@ static enum ggml_status ggml_metal_graph_compute(
id<MTLComputePipelineState> pipeline = nil; id<MTLComputePipelineState> pipeline = nil;
// use custom matrix x vector kernel // use custom matrix x vector kernel
switch (src2t) { switch (src0t) {
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ {
GGML_ASSERT(src1t == GGML_TYPE_F32); GGML_ASSERT(src1t == GGML_TYPE_F32);
@ -1947,8 +1935,8 @@ static enum ggml_status ggml_metal_graph_compute(
} }
}; };
if (ggml_is_quantized(src2t)) { if (ggml_is_quantized(src0t)) {
GGML_ASSERT(ne20 >= nth0*nth1); GGML_ASSERT(ne00 >= nth0*nth1);
} }
const int64_t _ne1 = 1; // kernels needs a reference in constant memory const int64_t _ne1 = 1; // kernels needs a reference in constant memory
@ -1957,75 +1945,66 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3]; [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
[encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:4];
[encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:5];
[encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:6];
[encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:7];
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8]; [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:8];
[encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9]; [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:9];
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10]; [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:10];
[encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11]; [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:12];
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:17]; [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
[encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:18];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:19]; [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:19];
[encoder setBytes:&r2 length:sizeof(r2) atIndex:20]; [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:20];
[encoder setBytes:&r3 length:sizeof(r3) atIndex:21]; [encoder setBytes:&r2 length:sizeof(r2) atIndex:21];
[encoder setBytes:&idx length:sizeof(idx) atIndex:22]; [encoder setBytes:&r3 length:sizeof(r3) atIndex:22];
// TODO: how to make this an array? read Metal docs [encoder setBytes:&idx length:sizeof(idx) atIndex:23];
for (int j = 0; j < 8; ++j) {
// NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
size_t offs_src_cur = 0; if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 ||
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur); src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K ||
src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 || src2t == GGML_TYPE_Q5_0 || const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 || src2t == GGML_TYPE_Q2_K ||
src2t == GGML_TYPE_IQ1_S || src2t == GGML_TYPE_IQ1_M || src2t == GGML_TYPE_IQ2_S) {
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
[encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src2t == GGML_TYPE_IQ3_XXS || src2t == GGML_TYPE_IQ3_S) { else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
const int mem_size = src2t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4; const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
[encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src2t == GGML_TYPE_IQ4_NL || src2t == GGML_TYPE_IQ4_XS) { else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
const int mem_size = 32*sizeof(float); const int mem_size = 32*sizeof(float);
[encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src2t == GGML_TYPE_Q4_K) { else if (src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src2t == GGML_TYPE_Q3_K) { else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64 #ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#else #else
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#endif #endif
} }
else if (src2t == GGML_TYPE_Q5_K) { else if (src0t == GGML_TYPE_Q5_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src2t == GGML_TYPE_Q6_K) { else if (src0t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else { } else {
const int64_t ny = (_ne1 + nrows - 1)/nrows; const int64_t ny = (_ne1 + nrows - 1)/nrows;
[encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
} }
} break; } break;
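The kernel-selection logic scattered through the hunks above boils down to one predicate. A condensed restatement, written as a free function purely for illustration (it is spelled out inline in the real code):

    #include <cstdint>

    static bool use_mul_mm_id_kernel(bool is_apple7_or_newer,   // A14+/M1+ SoC check
                                     int64_t ne00,              // row length of the experts
                                     int64_t ne11,              // number of src1 rows
                                     int     n_as) {            // number of experts
        const int ne11_mm_min = n_as;   // break-even point between mat-mat and mat-vec
        return is_apple7_or_newer
            && ne00 % 32 == 0 && ne00 >= 64   // simdgroup matrices need aligned, wide rows
            && ne11 > ne11_mm_min;
    }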
@ -2432,6 +2411,16 @@ static enum ggml_status ggml_metal_graph_compute(
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
// bitonic sort requires the number of elements to be power of 2
int64_t ne00_padded = 1;
while (ne00_padded < ne00) {
ne00_padded *= 2;
}
// Metal kernels require the buffer size to be multiple of 16 bytes
// https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength
const int mem_size = GGML_PAD(ne00_padded*sizeof(int32_t), 16);
id<MTLComputePipelineState> pipeline = nil; id<MTLComputePipelineState> pipeline = nil;
switch (order) { switch (order) {
@ -2441,11 +2430,13 @@ static enum ggml_status ggml_metal_graph_compute(
}; };
[encoder setComputePipelineState:pipeline]; [encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&ne00_padded length:sizeof( int64_t) atIndex:3];
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00_padded, 1, 1)];
} break; } break;
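A worked instance of the two size computations above, assuming GGML_PAD(x, n) rounds x up to the next multiple of n as it does in ggml.h:

    #include <stdint.h>
    #include <stdio.h>

    // stand-in for GGML_PAD(x, n): round x up to the next multiple of n
    static int64_t pad_to(int64_t x, int64_t n) { return (x + n - 1) / n * n; }

    int main(void) {
        int64_t ne00 = 100, ne00_padded = 1;
        while (ne00_padded < ne00) ne00_padded *= 2;                          // 128
        const int64_t mem_size = pad_to(ne00_padded * sizeof(int32_t), 16);   // 512 bytes
        printf("padded cols = %lld, threadgroup memory = %lld bytes\n",
               (long long) ne00_padded, (long long) mem_size);
        return 0;
    }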
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
{ {

File diff suppressed because it is too large

@ -8079,7 +8079,7 @@ template <bool need_check> static void
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl> template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
const sycl::nd_item<3> &item_ct1, const sycl::nd_item<3> &item_ct1,
const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr) { const uint32_t *iq3xxs_grid_ptr=nullptr, const uint64_t *ksigns64_ptr=nullptr) {
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
item_ct1.get_local_id(1); item_ct1.get_local_id(1);
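The signature change above is the standard defaulted-parameter trick: the lookup-table pointers that most quantization kernels no longer need become optional, so the remaining call sites that still pass them keep compiling. A generic illustration with placeholder names and body, not the real kernel:

    #include <cstdint>
    #include <cstdio>

    template <int QK>
    static void mul_mat_vec_stub(const float * x, float * dst, int ncols,
                                 const uint32_t * grid  = nullptr,    // optional table
                                 const uint64_t * signs = nullptr) {  // optional table
        (void) grid; (void) signs;
        float sum = 0.0f;
        for (int i = 0; i < ncols; ++i) sum += x[i];   // placeholder body
        dst[0] = sum;
    }

    int main() {
        const float x[4] = {1, 2, 3, 4};
        float dst[1] = {0};
        mul_mat_vec_stub<32>(x, dst, 4);               // new-style call: tables omitted
        std::printf("%g\n", dst[0]);
        return 0;
    }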
@ -9956,17 +9956,14 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
dpct::queue_ptr stream) { dpct::queue_ptr stream) {
const int nb = k / QK_K; const int nb = k / QK_K;
{ {
iq2xxs_grid.init(*stream);
ksigns_iq2xs.init(*stream);
kmask_iq2xs.init(*stream);
dpct::has_capability_or_fail(stream->get_device(), dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16}); {sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr(); auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32),
@ -9985,17 +9982,14 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
dpct::queue_ptr stream) { dpct::queue_ptr stream) {
const int nb = k / QK_K; const int nb = k / QK_K;
{ {
iq2xs_grid.init(*stream);
ksigns_iq2xs.init(*stream);
kmask_iq2xs.init(*stream);
dpct::has_capability_or_fail(stream->get_device(), dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16}); {sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr(); auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32),
@ -10014,17 +10008,14 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
dpct::queue_ptr stream) { dpct::queue_ptr stream) {
const int nb = k / QK_K; const int nb = k / QK_K;
{ {
iq3xxs_grid.init(*stream);
ksigns_iq2xs.init(*stream);
kmask_iq2xs.init(*stream);
dpct::has_capability_or_fail(stream->get_device(), dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16}); {sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32),
@ -10043,17 +10034,14 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
dpct::queue_ptr stream) { dpct::queue_ptr stream) {
const int nb = k / QK_K; const int nb = k / QK_K;
{ {
iq3s_grid.init(*stream);
ksigns_iq2xs.init(*stream);
kmask_iq2xs.init(*stream);
dpct::has_capability_or_fail(stream->get_device(), dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16}); {sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr(); auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32),
@ -10072,17 +10060,14 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
dpct::queue_ptr stream) { dpct::queue_ptr stream) {
const int nb = k / QK_K; const int nb = k / QK_K;
{ {
iq1s_grid_gpu.init(*stream);
ksigns_iq2xs.init(*stream);
kmask_iq2xs.init(*stream);
dpct::has_capability_or_fail(stream->get_device(), dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16}); {sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr(); auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32),
@ -10415,12 +10400,8 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
iq3xxs_grid.init(*stream);
ksigns64.init(*stream);
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10428,8 +10409,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
[[intel::reqd_sub_group_size(32)]] { [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>( VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1, vx, vy, dst, ncols, nrows, item_ct1);
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
}); });
}); });
} }
@ -10444,12 +10424,8 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
iq3xxs_grid.init(*stream);
ksigns64.init(*stream);
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10457,8 +10433,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
[[intel::reqd_sub_group_size(32)]] { [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>( VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1, vx, vy, dst, ncols, nrows, item_ct1);
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
}); });
}); });
} }
@ -10473,12 +10448,8 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
iq3xxs_grid.init(*stream);
ksigns64.init(*stream);
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10486,8 +10457,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
[[intel::reqd_sub_group_size(32)]] { [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>( VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1, vx, vy, dst, ncols, nrows, item_ct1);
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
}); });
}); });
} }
@ -10502,12 +10472,8 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
iq3xxs_grid.init(*stream);
ksigns64.init(*stream);
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10515,8 +10481,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
[[intel::reqd_sub_group_size(32)]] { [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>( VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1, vx, vy, dst, ncols, nrows, item_ct1);
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
}); });
}); });
} }
@ -10531,12 +10496,8 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
iq3xxs_grid.init(*stream);
ksigns64.init(*stream);
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10544,8 +10505,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
[[intel::reqd_sub_group_size(32)]] { [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>( VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1, vx, vy, dst, ncols, nrows, item_ct1);
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
}); });
}); });
} }
@ -10560,12 +10520,8 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
iq3xxs_grid.init(*stream);
ksigns64.init(*stream);
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10573,8 +10529,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
[[intel::reqd_sub_group_size(32)]] { [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>( VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1, vx, vy, dst, ncols, nrows, item_ct1);
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
}); });
}); });
} }
@ -10589,12 +10544,8 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
iq3xxs_grid.init(*stream);
ksigns64.init(*stream);
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10602,8 +10553,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
[[intel::reqd_sub_group_size(32)]] { [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>( VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1, vx, vy, dst, ncols, nrows, item_ct1);
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
}); });
}); });
} }
@ -10618,12 +10568,8 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
iq3xxs_grid.init(*stream);
ksigns64.init(*stream);
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10631,8 +10577,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
[[intel::reqd_sub_group_size(32)]] { [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>( VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1, vx, vy, dst, ncols, nrows, item_ct1);
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
}); });
}); });
} }
@ -10647,12 +10592,8 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
iq3xxs_grid.init(*stream);
ksigns64.init(*stream);
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10660,8 +10601,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
[[intel::reqd_sub_group_size(32)]] { [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>( VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1, vx, vy, dst, ncols, nrows, item_ct1);
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
}); });
}); });
} }
@ -10676,12 +10616,8 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
iq3xxs_grid.init(*stream);
ksigns64.init(*stream);
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10689,13 +10625,13 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
[[intel::reqd_sub_group_size(32)]] { [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>( VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1, vx, vy, dst, ncols, nrows, item_ct1);
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
}); });
}); });
} }
} }
static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols, float *dst, const int ncols,
const int nrows, const int nrows,
@ -10705,15 +10641,11 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
iq2xxs_grid.init(*stream);
ksigns_iq2xs.init(*stream);
kmask_iq2xs.init(*stream);
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr(); auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10736,12 +10668,10 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
iq2xs_grid.init(*stream);
ksigns64.init(*stream);
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr(); auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); auto ksigns64_ptr_ct1 = &ksigns64[0];
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10764,12 +10694,10 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
iq3xxs_grid.init(*stream);
ksigns64.init(*stream);
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); auto ksigns64_ptr_ct1 = &ksigns64[0];
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10792,12 +10720,10 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
iq3s_grid.init(*stream);
ksigns64.init(*stream);
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr(); auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); auto ksigns64_ptr_ct1 = &ksigns64[0];
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10820,12 +10746,10 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
iq1s_grid_gpu.init(*stream);
ksigns64.init(*stream);
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr(); auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); auto ksigns64_ptr_ct1 = &ksigns64[0];
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -11,17 +11,6 @@ extern "C" {
#define GGML_VK_MAX_DEVICES 16 #define GGML_VK_MAX_DEVICES 16
GGML_API void ggml_vk_instance_init(void); GGML_API void ggml_vk_instance_init(void);
GGML_API void ggml_vk_init_cpu_assist(void);
GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#ifdef GGML_VULKAN_CHECK_RESULTS
void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#endif
GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
GGML_API void ggml_vk_free_cpu_assist(void);
// backend API // backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num); GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);

ggml.c

@ -278,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
#include <Accelerate/Accelerate.h> #include <Accelerate/Accelerate.h>
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
#include "ggml-opencl.h" #include "ggml-opencl.h"
#elif defined(GGML_USE_VULKAN)
#include "ggml-vulkan.h"
#endif #endif
#elif defined(GGML_USE_OPENBLAS) #elif defined(GGML_USE_OPENBLAS)
#if defined(GGML_BLAS_USE_MKL) #if defined(GGML_BLAS_USE_MKL)
@ -289,8 +287,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
#endif #endif
#elif defined(GGML_USE_CLBLAST) #elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h" #include "ggml-opencl.h"
#elif defined(GGML_USE_VULKAN)
#include "ggml-vulkan.h"
#endif #endif
// floating point type used to accumulate sums // floating point type used to accumulate sums
@ -2717,8 +2713,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
#if defined(GGML_USE_CLBLAST) #if defined(GGML_USE_CLBLAST)
ggml_cl_init(); ggml_cl_init();
#elif defined(GGML_USE_VULKAN)
ggml_vk_init_cpu_assist();
#endif #endif
ggml_setup_op_has_task_pass(); ggml_setup_op_has_task_pass();
@ -4579,45 +4573,38 @@ void ggml_mul_mat_set_prec(
// ggml_mul_mat_id // ggml_mul_mat_id
// NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
// this will allow computing all the used experts in a single matrix multiplication
struct ggml_tensor * ggml_mul_mat_id( struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * const as[], struct ggml_tensor * as,
int n_as,
struct ggml_tensor * ids, struct ggml_tensor * ids,
int id, int id,
struct ggml_tensor * b) { struct ggml_tensor * b) {
GGML_ASSERT(ids->type == GGML_TYPE_I32); GGML_ASSERT(ids->type == GGML_TYPE_I32);
GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
GGML_ASSERT(ids->ne[1] == b->ne[1]); GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]); GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2); GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
GGML_ASSERT(id >= 0 && id < ids->ne[0]); GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
bool is_node = false; bool is_node = false;
if (as[0]->grad || b->grad) { if (as->grad || b->grad) {
is_node = true; is_node = true;
} }
const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] }; const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
ggml_set_op_params_i32(result, 0, id); ggml_set_op_params_i32(result, 0, id);
ggml_set_op_params_i32(result, 1, n_as);
result->op = GGML_OP_MUL_MAT_ID; result->op = GGML_OP_MUL_MAT_ID;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = ids; result->src[0] = as;
result->src[1] = b; result->src[1] = b;
result->src[2] = ids;
for (int i = 0; i < n_as; i++) {
struct ggml_tensor * a = as[i];
GGML_ASSERT(ggml_are_same_shape(as[0], a));
GGML_ASSERT(ggml_can_mul_mat(a, b));
GGML_ASSERT(!ggml_is_transposed(a));
result->src[i + 2] = a;
}
return result; return result;
} }
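A usage sketch of the new signature, assuming an initialized ggml_context ctx; all sizes are invented for illustration, and in a MoE model the ids tensor would typically come from a top-k over the router logits:

    // experts stacked along dim 2 of one 3-D tensor instead of an array of 2-D tensors
    const int n_embd        = 4096;
    const int n_ff          = 11008;
    const int n_expert      = 8;
    const int n_expert_used = 2;    // experts selected per token
    const int n_tokens      = 32;

    struct ggml_tensor * as  = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, n_embd, n_ff, n_expert);
    struct ggml_tensor * b   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    // one row of expert indices (values in [0, n_expert)) per row of b
    struct ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, n_tokens);

    // for every row of b, use the expert whose index sits in column `id` of that row of ids
    struct ggml_tensor * out = ggml_mul_mat_id(ctx, as, ids, /*id =*/ 0, b);
    // out has shape [n_ff, n_tokens, 1, 1]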
@ -10954,10 +10941,9 @@ static void ggml_compute_forward_mul_mat_id(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
const struct ggml_tensor * ids = dst->src[0]; const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1]; const struct ggml_tensor * src1 = dst->src[1];
const struct ggml_tensor * ids = dst->src[2];
const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
GGML_TENSOR_BINARY_OP_LOCALS GGML_TENSOR_BINARY_OP_LOCALS
@ -10987,13 +10973,13 @@ static void ggml_compute_forward_mul_mat_id(
GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3); GGML_ASSERT(nb2 <= nb3);
// broadcast factors // broadcast is not supported with mmid
const int64_t r2 = ne12/ne02; assert(ne12 == 1);
const int64_t r3 = ne13/ne03; assert(ne13 == 1);
// row groups // row groups
const int id = ggml_get_op_params_i32(dst, 0); const int id = ggml_get_op_params_i32(dst, 0);
const int n_as = ggml_get_op_params_i32(dst, 1); const int n_as = src0->ne[2];
char * wdata_src1_end = (src1->type == vec_dot_type) ? char * wdata_src1_end = (src1->type == vec_dot_type) ?
(char *) params->wdata : (char *) params->wdata :
@ -11053,7 +11039,7 @@ static void ggml_compute_forward_mul_mat_id(
continue; continue;
} }
const struct ggml_tensor * src0_cur = dst->src[cur_a + 2]; size_t src0_offset = cur_a*src0->nb[2];
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10); const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@ -11088,9 +11074,6 @@ static void ggml_compute_forward_mul_mat_id(
continue; continue;
} }
assert(ne12 % ne02 == 0);
assert(ne13 % ne03 == 0);
// block-tiling attempt // block-tiling attempt
const int64_t blck_0 = 16; const int64_t blck_0 = 16;
const int64_t blck_1 = 16; const int64_t blck_1 = 16;
@ -11107,14 +11090,14 @@ static void ggml_compute_forward_mul_mat_id(
const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11); const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
// broadcast src0 into src1 // broadcast src0 into src1
const int64_t i03 = i13/r3; //const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2; //const int64_t i02 = i12/r2;
const int64_t i1 = i11; const int64_t i1 = i11;
const int64_t i2 = i12; const int64_t i2 = i12;
const int64_t i3 = i13; const int64_t i3 = i13;
const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03); const char * src0_row = (const char *) src0->data + src0_offset;
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
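The hunk above replaces the per-expert src tensors with plain pointer arithmetic: every expert is a slice of the single 3D src0, selected by the stride of dimension 2, and the r2/r3 broadcast factors disappear. A minimal sketch of that addressing under the same assumptions (contiguous stacked experts, standard ggml strides):

#include "ggml.h"

// Sketch: the rows of expert `cur_a` start one nb[2]-sized step apart.
static const char * expert_base(const struct ggml_tensor * src0, int64_t cur_a) {
    // old code: dst->src[cur_a + 2] plus i02/i03 broadcast offsets
    // new code: a byte offset into the stacked tensor
    return (const char *) src0->data + cur_a*src0->nb[2];
}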
@ -16128,20 +16111,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
return; return;
} }
#if defined(GGML_USE_VULKAN)
const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
#ifdef GGML_VULKAN_CHECK_RESULTS
if (skip_cpu) {
ggml_vk_check_results_1_cpu_assist(params, tensor);
}
#endif
if (skip_cpu) {
return;
}
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
#endif // GGML_USE_VULKAN
switch (tensor->op) { switch (tensor->op) {
case GGML_OP_DUP: case GGML_OP_DUP:
{ {
@ -18484,13 +18453,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:
{ {
cur = 0; cur = 0;
const struct ggml_tensor * src0 = node->src[2]; const struct ggml_tensor * src0 = node->src[0];
const struct ggml_tensor * src1 = node->src[1]; const struct ggml_tensor * src1 = node->src[1];
const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type; const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
if (src1->type != vec_dot_type) { if (src1->type != vec_dot_type) {
cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)); cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
} }
const int n_as = ggml_get_op_params_i32(node, 1); const int n_as = src0->ne[2];
cur += GGML_PAD(cur, sizeof(int64_t)); // align cur += GGML_PAD(cur, sizeof(int64_t)); // align
cur += n_as * sizeof(int64_t); // matrix_row_counts cur += n_as * sizeof(int64_t); // matrix_row_counts
cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
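For planning, the expert count now comes from the shape of src[0] rather than from op_params. The sketch below mirrors the accounting in the hunk above; the vec_dot_type lookup through the internal type_traits table is replaced by an explicit parameter, so treat it as an illustration rather than the exact planner code:

#include "ggml.h"

// Sketch of the MUL_MAT_ID work-buffer sizing in ggml_graph_plan.
static size_t mul_mat_id_work_size(const struct ggml_tensor * node, enum ggml_type vec_dot_type) {
    const struct ggml_tensor * src0 = node->src[0];   // stacked experts
    const struct ggml_tensor * src1 = node->src[1];   // activations
    size_t cur = 0;
    if (src1->type != vec_dot_type) {
        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)); // converted copy of src1
    }
    const int64_t n_as = src0->ne[2];                 // one expert per slice of src0
    cur += GGML_PAD(cur, sizeof(int64_t));            // align
    cur += n_as*sizeof(int64_t);                      // matrix_row_counts
    cur += n_as*src1->ne[1]*sizeof(int64_t);          // matrix_rows
    return cur;
}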
@ -18617,17 +18586,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
} }
} }
#ifdef GGML_USE_VULKAN
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
}
ggml_vk_preallocate_buffers_cpu_assist();
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
}
#endif
const int n_threads = cplan->n_threads; const int n_threads = cplan->n_threads;
struct ggml_compute_state_shared state_shared = { struct ggml_compute_state_shared state_shared = {
@ -18684,10 +18642,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
} }
} }
#ifdef GGML_USE_VULKAN
ggml_vk_graph_cleanup_cpu_assist();
#endif
// performance stats (graph) // performance stats (graph)
{ {
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;

ggml.h

@ -1164,8 +1164,7 @@ extern "C" {
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b) // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
GGML_API struct ggml_tensor * ggml_mul_mat_id( GGML_API struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * const as[], struct ggml_tensor * as,
int n_as,
struct ggml_tensor * ids, struct ggml_tensor * ids,
int id, int id,
struct ggml_tensor * b); struct ggml_tensor * b);
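From the caller's side the migration is mechanical: instead of an array of 2D expert tensors plus n_as, the experts are stacked along dimension 2 of a single tensor whose first dimension must match b, and ids stays a 2D I32 tensor with one set of expert indices per column of b. A minimal usage sketch with illustrative shapes (k, m, n_expert, n_tok are placeholders, not values from this change):

#include "ggml.h"

// result ~= ggml_mul_mat(as[ids[id]], b), evaluated per column of b
static struct ggml_tensor * moe_matmul_sketch(struct ggml_context * ctx,
                                              struct ggml_tensor  * ids, // I32, ne = { n_used, n_tok }
                                              int                   id,  // 0 <= id < n_used
                                              struct ggml_tensor  * b,   // F32, ne = { k, n_tok }
                                              int64_t k, int64_t m, int64_t n_expert) {
    // all experts in one tensor: ne = { k, m, n_expert }
    struct ggml_tensor * as = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, k, m, n_expert);
    return ggml_mul_mat_id(ctx, as, ids, id, b);
}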


@ -18,6 +18,12 @@ shader_int8_ext = """
""" """
# Type-specific defines # Type-specific defines
shader_f32_defines = """
#define QUANT_K 1
#define QUANT_R 1
#define A_TYPE float
"""
shader_f16_defines = """ shader_f16_defines = """
#define QUANT_K 1 #define QUANT_K 1
#define QUANT_R 1 #define QUANT_R 1
@ -157,8 +163,8 @@ struct block_q6_K
""" """
# Dequant functions # Dequant functions
shader_f16_dequant_func = """ shader_float_dequant_func = """
#define DEQUANT_FUNC vec2 v = vec2(data_a[ib + 0], data_a[ib + 1]); #define DEQUANT_FUNC vec2 v = vec2(ib, ib); // data_a[ib], data_a[ib + 1]);
""" """
shader_q4_0_dequant_func = """ shader_q4_0_dequant_func = """
@ -410,6 +416,133 @@ mulmat_load_q8_0 = """
buf_a[buf_idx ] = FLOAT_TYPE(v.x); buf_a[buf_idx ] = FLOAT_TYPE(v.x);
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);""" buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);"""
mulmat_load_q2_K = """
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
const uint ib = idx / 128; // 2 values per idx
const uint iqs = idx % 128; // 0..127
const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30
const uint scalesi = iqs / 8; // 0..15
const uint qsshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]);
const uint scales = data_a[ib].scales[scalesi];
const vec2 d = vec2(data_a[ib].d);
const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4);
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);"""
mulmat_load_q3_K = """
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
const uint ib = idx / 128; // 2 values per idx
const uint iqs = idx % 128; // 0..127
const uint n = iqs / 64; // 0,1
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..62
const uint hmi = (iqs % 16) * 2; // 0,2,4..30
const uint j = (iqs % 64) / 4; // 0..3
const uint is = iqs / 8; // 0..15
const uint halfsplit = ((iqs % 64) / 16); // 0,1,2,3
const uint qsshift = halfsplit * 2; // 0,2,4,6
const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128
const int8_t us = int8_t(is < 4 ? (data_a[ib].scales[is-0] & 0xF) | (((data_a[ib].scales[is+8] >> 0) & 3) << 4) :
is < 8 ? (data_a[ib].scales[is-0] & 0xF) | (((data_a[ib].scales[is+4] >> 2) & 3) << 4) :
is < 12 ? (data_a[ib].scales[is-8] >> 4) | (((data_a[ib].scales[is+0] >> 4) & 3) << 4) :
(data_a[ib].scales[is-8] >> 4) | (((data_a[ib].scales[is-4] >> 6) & 3) << 4));
const float dl = float(data_a[ib].d) * float(us - 32);
buf_a[buf_idx ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi ] & m) != 0) ? 0 : 4)));
buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4)));"""
mulmat_load_q4_K = """
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
const uint ib = idx / 128; // 2 values per idx
const uint iqs = idx % 128; // 0..127
const uint n = iqs / 32; // 0,1,2,3
const uint b = (iqs % 32) / 16; // 0,1
const uint is = 2 * n + b; // 0..7
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126
const vec2 loadd = vec2(data_a[ib].d);
uint8_t sc;
uint8_t mbyte;
if (is < 4) {
sc = uint8_t(data_a[ib].scales[is ] & 63);
mbyte = uint8_t(data_a[ib].scales[is + 4] & 63);
} else {
sc = uint8_t((data_a[ib].scales[is + 4] & 0xF) | ((data_a[ib].scales[is - 4] >> 6) << 4));
mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4));
}
const float d = loadd.x * sc;
const float m = loadd.y * mbyte;
buf_a[buf_idx ] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) - m);
buf_a[buf_idx + 1] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) - m);"""
mulmat_load_q5_K = """
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
const uint ib = idx / 128; // 2 values per idx
const uint iqs = idx % 128; // 0..127
const uint n = iqs / 32; // 0,1,2,3
const uint b = (iqs % 32) / 16; // 0,1
const uint is = 2 * n + b; // 0..7
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126
const uint qhi = (iqs % 16) * 2; // 0,2,4..30
const uint8_t hm = uint8_t(1 << (iqs / 16));
const vec2 loadd = vec2(data_a[ib].d);
uint8_t sc;
uint8_t mbyte;
if (is < 4) {
sc = uint8_t(data_a[ib].scales[is ] & 63);
mbyte = uint8_t(data_a[ib].scales[is + 4] & 63);
} else {
sc = uint8_t((data_a[ib].scales[is + 4] & 0xF) | ((data_a[ib].scales[is - 4] >> 6) << 4));
mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4));
}
const float d = loadd.x * sc;
const float m = loadd.y * mbyte;
buf_a[buf_idx ] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0)) - m);
buf_a[buf_idx + 1] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0)) - m);"""
mulmat_load_q6_K = """
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
const uint ib = idx / 128; // 2 values per idx
const uint iqs = idx % 128; // 0..127
const uint n = iqs / 64; // 0,1
const uint b = (iqs % 64) / 32; // 0,1
const uint is_b = (iqs % 16) / 8; // 0,1
const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
const uint is = 8 * n + qhshift + is_b; // 0..15
const uint qsi = n * 64 + (iqs % 32) * 2; // 0,2,4..126
const uint qhi = n * 32 + (iqs % 16) * 2; // 0,2,4..62
const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);
buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32));
buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));"""
mulmat_body2 = """ mulmat_body2 = """
} }
[[unroll]] for (uint l = 0; l < BN; l += loadstride_b) { [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
@ -1611,8 +1744,9 @@ layout (push_constant) uniform parameter
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
uint d_offset; uint d_offset;
float param1; float param2; float param1; float param2;
} p; } p;"""
generic_unary_op_funcs = """
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
@ -1636,14 +1770,17 @@ uint dst_idx(uint idx) {
const uint i11 = (idx - i13_offset - i12_offset) / p.ne10; const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10; return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
} }"""
generic_unary_op_main = """
void main() { void main() {
if (gl_GlobalInvocationID.x >= p.ne) { if (gl_GlobalInvocationID.x >= p.ne) {
return; return;
} }
""" """
generic_unary_op_combined = f"{generic_unary_op_head}\n{generic_unary_op_funcs}\n{generic_unary_op_main}"
generic_binary_op_head = """#version 450 generic_binary_op_head = """#version 450
#extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_shader_16bit_storage : require
@ -1655,13 +1792,14 @@ layout (push_constant) uniform parameter
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23; uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
uint d_offset; uint d_offset;
uint param1; uint param2; float param1; float param2;
} p; } p;"""
generic_binary_op_funcs = """
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) readonly buffer B {A_TYPE data_b[];}; layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
uint src0_idx(uint idx) { uint src0_idx(uint idx) {
@ -1693,14 +1831,17 @@ uint dst_idx(uint idx) {
const uint i21 = (idx - i23_offset - i22_offset) / p.ne20; const uint i21 = (idx - i23_offset - i22_offset) / p.ne20;
const uint i20 = idx - i23_offset - i22_offset - i21*p.ne20; const uint i20 = idx - i23_offset - i22_offset - i21*p.ne20;
return i23*p.nb23 + i22*p.nb22 + i21*p.nb21 + i20*p.nb20; return i23*p.nb23 + i22*p.nb22 + i21*p.nb21 + i20*p.nb20;
} }"""
generic_binary_op_main = """
void main() { void main() {
if (gl_GlobalInvocationID.x >= p.ne) { if (gl_GlobalInvocationID.x >= p.ne) {
return; return;
} }
""" """
generic_binary_op_combined = f"{generic_binary_op_head}\n{generic_binary_op_funcs}\n{generic_binary_op_main}"
# MUL F32 # MUL F32
mul_body = """ mul_body = """
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)])); data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
@ -1745,39 +1886,55 @@ cpy_f16_f16_end = """
""" """
# GET_ROWS # GET_ROWS
get_rows_body = """ get_rows_float_body = """
#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_shader_8bit_storage : require
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) readonly buffer Y {int data_b[];};
layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
void main() { void main() {
const uint col = int(gl_GlobalInvocationID.x) * 2; const uint i00 = gl_GlobalInvocationID.x;
const uint row = int(gl_GlobalInvocationID.y); const uint i10 = gl_GlobalInvocationID.y;
const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
if (col >= p.KY) { if (i00 >= p.ne00) {
return; return;
} }
const uint r = uint(data_b[row]); const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
// copy data_a[r*p.KY + col] to dst[row*p.KX + col] const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
const uint xi = r*p.KY + col; const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
const uint di = row*p.KY + col;
const uint ib = xi/QUANT_K; // block index #ifndef OPTIMIZATION_ERROR_WORKAROUND
const uint iqs = (xi%QUANT_K)/QUANT_R; // quant index data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
const uint iybs = di - di%QUANT_K; // y block start index #else
data_d[d_offset + i00] = data_a[a_offset + i00];
#endif
}
"""
get_rows_body = """
void main() {
const uint i00 = (gl_GlobalInvocationID.x)*2;
const uint i10 = gl_GlobalInvocationID.y;
const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
if (i00 >= p.ne00) {
return;
}
const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
const uint ib = a_offset + i00/QUANT_K; // block index
const uint iqs = (i00%QUANT_K)/QUANT_R; // quant index
const uint iybs = i00 - i00%QUANT_K; // dst block start index
const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2; const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
DEQUANT_FUNC DEQUANT_FUNC
dst[iybs + iqs + 0] = D_TYPE(v.x); data_d[d_offset + iybs + iqs ] = D_TYPE(v.x);
dst[iybs + iqs + y_offset] = D_TYPE(v.y); data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y);
} }
""" """
@ -2418,6 +2575,31 @@ async def main():
tasks.append(string_to_spv("matmul_q8_0_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q8_0", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) tasks.append(string_to_spv("matmul_q8_0_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q8_0", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv("matmul_q8_0_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q8_0", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) tasks.append(string_to_spv("matmul_q8_0_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q8_0", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
stream.clear()
stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q2_K_defines, mulmat_body1, mulmat_load_q2_K, mulmat_body2))
tasks.append(string_to_spv("matmul_q2_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q2_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv("matmul_q2_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q2_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
stream.clear()
stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q3_K_defines, mulmat_body1, mulmat_load_q3_K, mulmat_body2))
tasks.append(string_to_spv("matmul_q3_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q3_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv("matmul_q3_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q3_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
stream.clear()
stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q4_K_defines, mulmat_body1, mulmat_load_q4_K, mulmat_body2))
tasks.append(string_to_spv("matmul_q4_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q4_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv("matmul_q4_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q4_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
stream.clear()
stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q5_K_defines, mulmat_body1, mulmat_load_q5_K, mulmat_body2))
tasks.append(string_to_spv("matmul_q5_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q5_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv("matmul_q5_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q5_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
stream.clear()
stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q6_K_defines, mulmat_body1, mulmat_load_q6_K, mulmat_body2))
tasks.append(string_to_spv("matmul_q6_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q6_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv("matmul_q6_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q6_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
# Shaders where precision is needed, so no fp16 version # Shaders where precision is needed, so no fp16 version
# mul mat vec # mul mat vec
@ -2426,7 +2608,7 @@ async def main():
stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32)) stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32))
if i == GGML_TYPE_F16: if i == GGML_TYPE_F16:
stream.extend((shader_f16_defines, shader_f16_dequant_func, mul_mat_vec_body)) stream.extend((shader_f16_defines, shader_float_dequant_func, mul_mat_vec_body))
elif i == GGML_TYPE_Q4_0: elif i == GGML_TYPE_Q4_0:
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, mul_mat_vec_body)) stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, mul_mat_vec_body))
elif i == GGML_TYPE_Q4_1: elif i == GGML_TYPE_Q4_1:
@ -2488,25 +2670,32 @@ async def main():
# get_rows # get_rows
for i in range(0, VK_NUM_TYPES): for i in range(0, VK_NUM_TYPES):
stream.clear() stream.clear()
stream.extend((generic_head, shader_int8_ext, shader_f32)) stream.extend((generic_binary_op_head, shader_int8_ext, shader_f32))
optimization_workaround = False
if i == GGML_TYPE_F16: if i == GGML_TYPE_F32:
stream.extend((shader_f16_defines, shader_f16_dequant_func, get_rows_body)) stream.extend((shader_f32_defines, generic_binary_op_funcs, get_rows_float_body))
elif i == GGML_TYPE_F16:
stream.extend((shader_f16_defines, generic_binary_op_funcs, get_rows_float_body))
optimization_workaround = True
elif i == GGML_TYPE_Q4_0: elif i == GGML_TYPE_Q4_0:
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, get_rows_body)) stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, generic_binary_op_funcs, get_rows_body))
elif i == GGML_TYPE_Q4_1: elif i == GGML_TYPE_Q4_1:
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, get_rows_body)) stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, generic_binary_op_funcs, get_rows_body))
elif i == GGML_TYPE_Q5_0: elif i == GGML_TYPE_Q5_0:
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, get_rows_body)) stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, generic_binary_op_funcs, get_rows_body))
elif i == GGML_TYPE_Q5_1: elif i == GGML_TYPE_Q5_1:
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, get_rows_body)) stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, generic_binary_op_funcs, get_rows_body))
elif i == GGML_TYPE_Q8_0: elif i == GGML_TYPE_Q8_0:
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, get_rows_body)) stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, generic_binary_op_funcs, get_rows_body))
else: else:
continue continue
tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"})) if optimization_workaround:
tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"})) tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
else:
tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float"}))
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"})) tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"})) tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
@ -2515,20 +2704,20 @@ async def main():
tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"})) tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"})) tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("cpy_f32_f32", f"{generic_unary_op_head}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"})) tasks.append(string_to_spv("cpy_f32_f32", f"{generic_unary_op_combined}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("cpy_f32_f16", f"{generic_unary_op_head}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"})) tasks.append(string_to_spv("cpy_f32_f16", f"{generic_unary_op_combined}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("cpy_f16_f16", f"{generic_unary_op_head}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"})) tasks.append(string_to_spv("cpy_f16_f16", f"{generic_unary_op_combined}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("add_f32", f"{generic_binary_op_head}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) tasks.append(string_to_spv("add_f32", f"{generic_binary_op_combined}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {})) tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}))
tasks.append(string_to_spv("mul_f32", f"{generic_binary_op_head}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) tasks.append(string_to_spv("mul_f32", f"{generic_binary_op_combined}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("scale_f32", f"{generic_unary_op_head}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) tasks.append(string_to_spv("scale_f32", f"{generic_unary_op_combined}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("sqr_f32", f"{generic_unary_op_head}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) tasks.append(string_to_spv("sqr_f32", f"{generic_unary_op_combined}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("clamp_f32", f"{generic_unary_op_head}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) tasks.append(string_to_spv("clamp_f32", f"{generic_unary_op_combined}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"})) tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"})) tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))


@ -123,6 +123,7 @@ class MODEL_ARCH(IntEnum):
GEMMA = auto() GEMMA = auto()
STARCODER2 = auto() STARCODER2 = auto()
MAMBA = auto() MAMBA = auto()
XVERSE = auto()
COMMAND_R = auto() COMMAND_R = auto()
@ -191,6 +192,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA: "gemma",
MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.STARCODER2: "starcoder2",
MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.COMMAND_R: "command-r",
} }
@ -219,9 +221,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}", MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
@ -365,6 +367,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_ACT, MODEL_TENSOR.FFN_ACT,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.POS_EMBD,
], ],
MODEL_ARCH.GPTJ: [ MODEL_ARCH.GPTJ: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
@ -606,6 +611,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.SSM_D, MODEL_TENSOR.SSM_D,
MODEL_TENSOR.SSM_OUT, MODEL_TENSOR.SSM_OUT,
], ],
MODEL_ARCH.XVERSE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.COMMAND_R: [ MODEL_ARCH.COMMAND_R: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
@ -650,6 +671,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD, MODEL_TENSOR.ATTN_ROT_EMBD,
], ],
MODEL_ARCH.XVERSE: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
} }
# #


@ -231,9 +231,8 @@ class TensorNameMap:
), ),
MODEL_TENSOR.FFN_UP_EXP: ( MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.decoder_layer.{bid}.moe.{xid}.linear_v", # Grok
), ),
# AWQ-activation gate # AWQ-activation gate
@ -252,9 +251,8 @@ class TensorNameMap:
), ),
MODEL_TENSOR.FFN_GATE_EXP: ( MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral "transformer.decoder_layer.{bid}.moe.linear" # Grok (merged)
"transformer.decoder_layer.{bid}.moe.{xid}.linear" # Grok
), ),
# Feed-forward down # Feed-forward down
@ -280,20 +278,20 @@ class TensorNameMap:
), ),
MODEL_TENSOR.FFN_DOWN_EXP: ( MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
"transformer.decoder_layer.{bid}.moe.{xid}.linear_1", # Grok
), ),
MODEL_TENSOR.ATTN_Q_NORM: ( MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm", "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
"model.layers.{bid}.self_attn.q_layernorm", # persimmon "model.layers.{bid}.self_attn.q_layernorm", # persimmon
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
), ),
MODEL_TENSOR.ATTN_K_NORM: ( MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm", "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", # persimmon "model.layers.{bid}.self_attn.k_layernorm", # persimmon
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
), ),
MODEL_TENSOR.ROPE_FREQS: ( MODEL_TENSOR.ROPE_FREQS: (


@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "gguf" name = "gguf"
version = "0.8.0" version = "0.9.0"
description = "Read and write ML models in GGUF for GGML" description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"] authors = ["GGML <ggml@ggml.ai>"]
packages = [ packages = [

llama.cpp
File diff suppressed because it is too large.

llama.h

@ -1009,10 +1009,38 @@ extern "C" {
struct ggml_tensor; struct ggml_tensor;
struct llama_partial_utf8 {
uint32_t value; // bit value so far (unshifted)
int n_remain; // num bytes remaining; -1 indicates invalid sequence
};
struct llama_grammar {
const std::vector<std::vector<llama_grammar_element>> rules;
std::vector<std::vector<const llama_grammar_element *>> stacks;
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
};
struct llama_grammar_candidate {
size_t index;
const uint32_t * code_points;
llama_partial_utf8 partial_utf8;
};
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map( const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx struct llama_context * ctx
); );
std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
const std::vector<std::vector<llama_grammar_element>> & rules,
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
const uint32_t chr);
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
const std::string & src,
llama_partial_utf8 partial_start);
#endif // LLAMA_API_INTERNAL #endif // LLAMA_API_INTERNAL
#endif // LLAMA_H #endif // LLAMA_H
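The internal grammar declarations exposed here make it possible to exercise the partial-UTF-8 handling directly, e.g. from tests: decode_utf8 returns the code points it could decode plus a llama_partial_utf8 carry that is fed into the next call. A hedged sketch under LLAMA_API_INTERNAL (the byte values are illustrative):

#define LLAMA_API_INTERNAL
#include "llama.h"

#include <string>

// Sketch: a two-byte code point ("é", 0xC3 0xA9) split across two accepted pieces.
static void partial_utf8_sketch() {
    llama_partial_utf8 carry = { 0, 0 };                      // no pending bytes
    auto first  = decode_utf8(std::string("\xC3", 1), carry); // incomplete: n_remain > 0
    carry = first.second;
    auto second = decode_utf8(std::string("\xA9", 1), carry); // completes U+00E9
    (void) second;                                            // second.first holds the decoded code points
}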


@ -178,6 +178,9 @@ def get_commit_hexsha8(name):
for t in repo.tags: for t in repo.tags:
if t.name == name: if t.name == name:
return t.commit.hexsha[:8] return t.commit.hexsha[:8]
for c in repo.iter_commits("--all"):
if c.hexsha[:8] == name[:8]:
return c.hexsha[:8]
return None return None
@ -224,7 +227,7 @@ if known_args.compare is not None:
hexsha8_compare = get_commit_hexsha8(known_args.compare) hexsha8_compare = get_commit_hexsha8(known_args.compare)
name_compare = known_args.compare name_compare = known_args.compare
if hexsha8_compare is None: if hexsha8_compare is None:
print(f"ERROR: cannot find data for baseline={known_args.compare}.") print(f"ERROR: cannot find data for compare={known_args.compare}.")
sys.exit(1) sys.exit(1)
# Otherwise, search for the commit for llama-bench was most recently run # Otherwise, search for the commit for llama-bench was most recently run
# and that is not a parent of master: # and that is not a parent of master:


@ -95,6 +95,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# src/ggml-backend-impl.h -> ggml-backend-impl.h # src/ggml-backend-impl.h -> ggml-backend-impl.h
# src/ggml-backend.c -> ggml-backend.c # src/ggml-backend.c -> ggml-backend.c
# src/ggml-common.h -> ggml-common.h # src/ggml-common.h -> ggml-common.h
# src/ggml-cuda/* -> ggml-cuda/
# src/ggml-cuda.cu -> ggml-cuda.cu # src/ggml-cuda.cu -> ggml-cuda.cu
# src/ggml-cuda.h -> ggml-cuda.h # src/ggml-cuda.h -> ggml-cuda.h
# src/ggml-impl.h -> ggml-impl.h # src/ggml-impl.h -> ggml-impl.h
@ -128,6 +129,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
-e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \ -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \
-e 's/src\/ggml-backend\.c/ggml-backend.c/g' \ -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \
-e 's/src\/ggml-common\.h/ggml-common.h/g' \ -e 's/src\/ggml-common\.h/ggml-common.h/g' \
-e 's/src\/ggml-cuda\//ggml-cuda\//g' \
-e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \ -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
-e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \ -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
-e 's/src\/ggml-impl\.h/ggml-impl.h/g' \ -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \


@ -5,6 +5,7 @@ cp -rpv ../ggml/src/ggml-alloc.c ./ggml-alloc.c
cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml-backend-impl.h cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml-backend-impl.h
cp -rpv ../ggml/src/ggml-backend.c ./ggml-backend.c cp -rpv ../ggml/src/ggml-backend.c ./ggml-backend.c
cp -rpv ../ggml/src/ggml-common.h ./ggml-common.h cp -rpv ../ggml/src/ggml-common.h ./ggml-common.h
cp -rpv ../ggml/src/ggml-cuda/* ./ggml-cuda/
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
cp -rpv ../ggml/src/ggml-impl.h ./ggml-impl.h cp -rpv ../ggml/src/ggml-impl.h ./ggml-impl.h


@ -979,17 +979,13 @@ struct test_mul_mat_id : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * build_graph(ggml_context * ctx) override {
// C^T = A * B^T: (k, m) * (k, n) => (m, n) // C^T = A * B^T: (k, m) * (k, n) => (m, n)
std::vector<ggml_tensor *> mats; ggml_tensor * mats = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
for (int i = 0; i < n_mats; i++) {
ggml_tensor * a = ggml_new_tensor_2d(ctx, type_a, k, m);
mats.push_back(a);
}
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n); ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
if (v) { if (v) {
ids = ggml_view_2d(ctx, ids, n_mats/2, ids->ne[1], ids->nb[1], 0); ids = ggml_view_2d(ctx, ids, n_mats/2, ids->ne[1], ids->nb[1], 0);
} }
ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n); ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n);
ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, v ? id/2 : id, b); ggml_tensor * out = ggml_mul_mat_id(ctx, mats, ids, v ? id/2 : id, b);
return out; return out;
} }
@ -1477,91 +1473,6 @@ struct test_leaky_relu : public test_case {
} }
}; };
// Mixtral MOE
struct test_moe : public test_case {
const int n_experts;
const int n_experts_per_tok;
const int n_tokens;
const int n_embd;
const int n_ff;
std::string op_desc(ggml_tensor * t) override {
return "MOE";
GGML_UNUSED(t);
}
std::string vars() override {
return VARS_TO_STR5(n_experts, n_experts_per_tok, n_tokens, n_embd, n_ff);
}
test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
: n_experts(n_experts), n_experts_per_tok(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_experts);
std::vector<ggml_tensor *> ffn_up_exp(n_experts);
std::vector<ggml_tensor *> ffn_gate_exp(n_experts);
std::vector<ggml_tensor *> ffn_down_exp(n_experts);
for (int i = 0; i < n_experts; ++i) {
ffn_up_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
ffn_gate_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
ffn_down_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
}
ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, nullptr, 1.0f/sqrtf(n_embd), 0.0f);
// select experts
ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);
ggml_tensor * weights = ggml_get_rows(ctx,
ggml_reshape_3d(ctx, probs, 1, n_experts, n_tokens), selected_experts);
weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens);
ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights);
weights = ggml_div(ctx, weights, weights_sum);
// compute expert outputs
ggml_tensor * moe_out = nullptr;
for (int i = 0; i < n_experts_per_tok; ++i) {
ggml_tensor * cur_expert;
ggml_tensor * cur_up = ggml_mul_mat_id(ctx, ffn_up_exp.data(), n_experts, selected_experts, i, cur);
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx, ffn_gate_exp.data(), n_experts, selected_experts, i, cur);
cur_gate = ggml_silu(ctx, cur_gate);
cur_expert = ggml_mul(ctx, cur_up, cur_gate);
cur_expert = ggml_mul_mat_id(ctx, ffn_down_exp.data(), n_experts, selected_experts, i, cur_expert);
cur_expert = ggml_mul(ctx, cur_expert,
ggml_view_2d(ctx, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
if (i == 0) {
moe_out = cur_expert;
} else {
moe_out = ggml_add(ctx, moe_out, cur_expert);
}
}
cur = moe_out;
return cur;
}
};
enum llm_norm_type { enum llm_norm_type {
LLM_NORM, LLM_NORM,
LLM_NORM_RMS, LLM_NORM_RMS,
@ -2169,6 +2080,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) { for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
} }
test_cases.emplace_back(new test_sum_rows()); test_cases.emplace_back(new test_sum_rows());
@ -2182,11 +2094,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
// these tests are disabled to save execution time, but they can be handy for debugging // these tests are disabled to save execution time, but they can be handy for debugging
#if 0 #if 0
#if !defined(__SANITIZE_THREAD__)
// FIXME: these tests use too much memory with thread sanitizer
test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
//test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
#endif
test_cases.emplace_back(new test_llama(1)); test_cases.emplace_back(new test_llama(1));
test_cases.emplace_back(new test_llama(2)); test_cases.emplace_back(new test_llama(2));
test_cases.emplace_back(new test_falcon(1)); test_cases.emplace_back(new test_falcon(1));


@ -33,6 +33,18 @@ int main(void) {
"{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}", "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
// OrionStarAI/Orion-14B-Chat // OrionStarAI/Orion-14B-Chat
"{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}", "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
// openchat/openchat-3.5-0106
// The included chat_template differs from the author's suggestions here: https://huggingface.co/openchat/openchat_3.5/discussions/5#65448109b4a3f3a2f486fd9d
// So we match against the included template but implement the suggested version.
"{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}",
// deepseek-ai/deepseek-coder-33b-instruct
"{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}",
// eachadea/vicuna-13b-1.1
// No template included in tokenizer_config.json, so this template likely needs to be manually set.
"{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '' + message['content'] + '\n\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}",
// Orca-Vicuna
// No template included in tokenizer_config.json, so this template likely needs to be manually set.
"{%- for message in messages %}{%- if message['role'] == 'system' -%}{{-'SYSTEM: ' + message['content'] + '\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}",
}; };
std::vector<std::string> expected_output = { std::vector<std::string> expected_output = {
// teknium/OpenHermes-2.5-Mistral-7B // teknium/OpenHermes-2.5-Mistral-7B
@ -49,6 +61,14 @@ int main(void) {
"<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n", "<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
// OrionStarAI/Orion-14B-Chat // OrionStarAI/Orion-14B-Chat
"Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s> I am an assistant </s>Human: Another question\n\nAssistant: </s>", "Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s> I am an assistant </s>Human: Another question\n\nAssistant: </s>",
// openchat/openchat-3.5-0106
"You are a helpful assistant<|end_of_turn|>GPT4 Correct User: Hello<|end_of_turn|>GPT4 Correct Assistant: Hi there<|end_of_turn|>GPT4 Correct User: Who are you<|end_of_turn|>GPT4 Correct Assistant: I am an assistant <|end_of_turn|>GPT4 Correct User: Another question<|end_of_turn|>GPT4 Correct Assistant:",
// deepseek-ai/deepseek-coder-33b-instruct
"You are a helpful assistant### Instruction:\nHello\n### Response:\nHi there\n<|EOT|>\n### Instruction:\nWho are you\n### Response:\n I am an assistant \n<|EOT|>\n### Instruction:\nAnother question\n### Response:\n",
// eachadea/vicuna-13b-1.1
"You are a helpful assistant\n\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT: I am an assistant </s>\nUSER: Another question\nASSISTANT:",
// Orca-Vicuna
"SYSTEM: You are a helpful assistant\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT: I am an assistant </s>\nUSER: Another question\nASSISTANT:",
}; };
std::vector<char> formatted_chat(1024); std::vector<char> formatted_chat(1024);
int32_t res; int32_t res;