diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 758796632..6dc91d27b 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -32,7 +32,7 @@ on:
- cron: '04 2 * * *'
concurrency:
- group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
cancel-in-progress: true
jobs:
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ff7238aba..2d747e688 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -32,6 +32,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
- name: Dependencies
id: depends
@@ -52,7 +54,7 @@ jobs:
id: cmake_test
run: |
cd build
- ctest -L main --verbose --timeout 900
+ ctest -L 'main|curl' --verbose --timeout 900
- name: Determine tag name
id: tag
@@ -88,6 +90,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
- name: Dependencies
id: depends
@@ -101,7 +105,9 @@ jobs:
sysctl -a
mkdir build
cd build
- cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+ # Metal is disabled due to intermittent failures with GitHub runners not having a GPU:
+ # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+ cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -204,26 +210,28 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
- name: Dependencies
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential
+ sudo apt-get install build-essential libcurl4-openssl-dev
- name: Build
id: cmake_build
run: |
mkdir build
cd build
- cmake .. -DLLAMA_FATAL_WARNINGS=ON
+ cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
cmake --build . --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
- ctest -L main --verbose --timeout 900
+ ctest -L 'main|curl' --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
@@ -236,6 +244,33 @@ jobs:
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+ - name: Determine tag name
+ id: tag
+ shell: bash
+ run: |
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+ else
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+ fi
+
+ - name: Pack artifacts
+ id: pack_artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ cp LICENSE ./build/bin/
+ zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v4
+ with:
+ path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
+ name: llama-bin-ubuntu-x64.zip
+
# ubuntu-latest-cmake-sanitizer:
# runs-on: ubuntu-latest
#
@@ -558,6 +593,63 @@ jobs:
run: |
make swift
+ windows-msys2:
+ runs-on: windows-latest
+
+ strategy:
+ fail-fast: false
+ matrix:
+ include:
+ - { sys: UCRT64, env: ucrt-x86_64, build: Release }
+ - { sys: CLANG64, env: clang-x86_64, build: Release }
+
+ steps:
+ - name: Clone
+ uses: actions/checkout@v4
+
+ - name: Setup ${{ matrix.sys }}
+ uses: msys2/setup-msys2@v2
+ with:
+ update: true
+ msystem: ${{matrix.sys}}
+ install: >-
+ base-devel
+ mingw-w64-${{matrix.env}}-toolchain
+ mingw-w64-${{matrix.env}}-cmake
+ mingw-w64-${{matrix.env}}-openblas
+
+ - name: Build using make
+ shell: msys2 {0}
+ run: |
+ make -j $(nproc)
+
+ - name: Clean after building using make
+ shell: msys2 {0}
+ run: |
+ make clean
+
+ - name: Build using make w/ OpenBLAS
+ shell: msys2 {0}
+ run: |
+ make LLAMA_OPENBLAS=1 -j $(nproc)
+
+ - name: Build using CMake
+ shell: msys2 {0}
+ run: |
+ cmake -B build
+ cmake --build build --config ${{ matrix.build }} -j $(nproc)
+
+ - name: Clean after building using CMake
+ shell: msys2 {0}
+ run: |
+ rm -rf build
+
+ - name: Build using CMake w/ OpenBLAS
+ shell: msys2 {0}
+ run: |
+ cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+ cmake --build build --config ${{ matrix.build }} -j $(nproc)
+
windows-latest-cmake:
runs-on: windows-latest
@@ -938,6 +1030,12 @@ jobs:
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v4
+ with:
+ path: ./artifact
+
+ - name: Move artifacts
+ id: move_artifacts
+ run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
- name: Create release
id: create_release
@@ -956,7 +1054,7 @@ jobs:
const path = require('path');
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
- for (let file of await fs.readdirSync('./artifact')) {
+ for (let file of await fs.readdirSync('./artifact/release')) {
if (path.extname(file) === '.zip') {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
@@ -964,7 +1062,7 @@ jobs:
repo: context.repo.repo,
release_id: release_id,
name: file,
- data: await fs.readFileSync(`./artifact/${file}`)
+ data: await fs.readFileSync(`./artifact/release/${file}`)
});
}
}
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index eefd87878..9b03d19bc 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -91,6 +91,12 @@ jobs:
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
+ - name: Downcase github.repository_owner
+ run: |
+ echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
+ env:
+ GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+
- name: Build and push Docker image (versioned)
if: github.event_name == 'push'
uses: docker/build-push-action@v4
@@ -98,7 +104,7 @@ jobs:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
- tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+ tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
- name: Build and push Docker image (tagged)
@@ -107,5 +113,5 @@ jobs:
context: .
push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platforms }}
- tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+ tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
file: ${{ matrix.config.dockerfile }}
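
Context for the `Downcase github.repository_owner` step added above: ghcr.io image names must be lowercase, while a GitHub owner name may contain capitals. The step relies on bash's `${parameter@L}` lowercase transformation (available in recent bash, 5.1+, which the hosted runners provide). A minimal standalone sketch, with a made-up owner name standing in for `${{ github.repository_owner }}`:

```bash
#!/usr/bin/env bash
# Illustration only, not part of the workflow: lowercase an owner name the
# same way the "Downcase github.repository_owner" step does.
GITHUB_REPOSITORY_OWNER='SomeOwner'   # hypothetical value
echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}"
# prints: repository_owner_lowercase=someowner
```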
diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml
index f4ae65495..5be17f157 100644
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -21,4 +21,4 @@ jobs:
uses: py-actions/flake8@v2
with:
ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
- exclude: "examples/*,examples/*/**,*/**/__init__.py"
+ exclude: "examples/*,examples/*/**,*/**/__init__.py,convert-hf-to-gguf-update.py"
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 521cc29ae..79cd7d643 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -23,7 +23,7 @@ on:
- cron: '2 4 * * *'
concurrency:
- group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
@@ -41,23 +41,16 @@ jobs:
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
- container:
- image: ubuntu:latest
- ports:
- - 8888
- options: --cpus 4
-
steps:
- name: Dependencies
id: depends
run: |
- apt-get update
- apt-get -y install \
+ sudo apt-get update
+ sudo apt-get -y install \
build-essential \
xxd \
git \
cmake \
- python3-pip \
curl \
wget \
language-pack-en \
@@ -70,6 +63,17 @@ jobs:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+ - name: Python setup
+ id: setup_python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.11'
+
+ - name: Tests dependencies
+ id: test_dependencies
+ run: |
+ pip install -r examples/server/tests/requirements.txt
+
- name: Verify server deps
id: verify_server_deps
run: |
@@ -100,10 +104,6 @@ jobs:
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
- - name: Tests dependencies
- id: test_dependencies
- run: |
- pip install -r examples/server/tests/requirements.txt
- name: Tests
id: server_integration_tests
@@ -129,6 +129,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
+ ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: libCURL
id: get_libcurl
diff --git a/.gitignore b/.gitignore
index 9fb5b80c3..60f9d1f8d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,6 +34,7 @@ lcov-report/
gcovr-report/
build*
+!build.zig
cmake-build-*
out/
tmp/
@@ -48,6 +49,7 @@ models-mnt
/convert-llama2c-to-ggml
/embd-input-test
/embedding
+/eval-callback
/gguf
/gguf-llama-simple
/gguf-split
@@ -99,7 +101,25 @@ qnt-*.txt
perf-*.txt
examples/jeopardy/results.txt
+examples/server/*.html.hpp
+examples/server/*.js.hpp
+examples/server/*.mjs.hpp
poetry.lock
poetry.toml
nppBackup
+
+# Test binaries
+/tests/test-grammar-parser
+/tests/test-llama-grammar
+/tests/test-double-float
+/tests/test-grad0
+/tests/test-opt
+/tests/test-quantize-fns
+/tests/test-quantize-perf
+/tests/test-sampling
+/tests/test-tokenizer-0
+/tests/test-tokenizer-1-spm
+/tests/test-tokenizer-1-bpe
+/tests/test-rope
+/tests/test-backend-ops
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 19fdfa46c..477c5b57c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,8 @@ else()
set(LLAMA_METAL_DEFAULT OFF)
endif()
+set(LLAMA_LLAMAFILE_DEFAULT ON)
+
# general
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
option(LLAMA_STATIC "llama: static link libraries" OFF)
@@ -88,6 +90,7 @@ endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
+option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
@@ -286,6 +289,7 @@ if (LLAMA_METAL)
${METALKIT_FRAMEWORK}
)
endif()
+
if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
@@ -368,6 +372,13 @@ if (LLAMA_BLAS)
endif()
endif()
+if (LLAMA_LLAMAFILE)
+ add_compile_definitions(GGML_USE_LLAMAFILE)
+
+ set(GGML_HEADERS_LLAMAFILE sgemm.h)
+ set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
+endif()
+
if (LLAMA_QKK_64)
add_compile_definitions(GGML_QKK_64)
endif()
@@ -1151,15 +1162,16 @@ add_library(ggml OBJECT
ggml-backend.h
ggml-quants.c
ggml-quants.h
- ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
- ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
- ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
- ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
- ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
- ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
- ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
- ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
- ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
+ ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
+ ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
+ ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
+ ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
+ ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
+ ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
+ ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
+ ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
+ ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
+ ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
)
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
diff --git a/Makefile b/Makefile
index 11b31c5c8..0a73f2a58 100644
--- a/Makefile
+++ b/Makefile
@@ -1,16 +1,28 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
- simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
# Binaries only useful for tests
TEST_TARGETS = \
- tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
- tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
- tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
- tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
- tests/test-json-schema-to-grammar tests/test-grammar-integration
+ tests/test-autorelease \
+ tests/test-backend-ops \
+ tests/test-double-float \
+ tests/test-grad0 \
+ tests/test-grammar-integration \
+ tests/test-grammar-parser \
+ tests/test-json-schema-to-grammar \
+ tests/test-llama-grammar \
+ tests/test-model-load-cancel \
+ tests/test-opt \
+ tests/test-quantize-fns \
+ tests/test-quantize-perf \
+ tests/test-rope \
+ tests/test-sampling \
+ tests/test-tokenizer-0 \
+ tests/test-tokenizer-1-bpe \
+ tests/test-tokenizer-1-spm
# Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -27,6 +39,17 @@ ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif
+# In GNU make, the default CXX is g++ instead of c++. Fix that so that users
+# of non-gcc compilers don't have to provide a g++ alias or wrapper.
+DEFCC := cc
+DEFCXX := c++
+ifeq ($(origin CC),default)
+CC := $(DEFCC)
+endif
+ifeq ($(origin CXX),default)
+CXX := $(DEFCXX)
+endif
+
# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin)
@@ -49,11 +72,17 @@ default: $(BUILD_TARGETS)
test: $(TEST_TARGETS)
@failures=0; \
for test_target in $(TEST_TARGETS); do \
- if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
- ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
- elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
+ if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
- elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+ elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
continue; \
@@ -384,6 +413,11 @@ ifdef LLAMA_OPENBLAS
MK_LDFLAGS += $(shell pkg-config --libs openblas)
endif # LLAMA_OPENBLAS
+ifndef LLAMA_NO_LLAMAFILE
+ MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
+ OBJS += sgemm.o
+endif
+
ifdef LLAMA_BLIS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
MK_LDFLAGS += -lblis -L/usr/local/lib
@@ -480,11 +514,9 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(NVCC_COMPILE)
-
endif # LLAMA_CUDA
ifdef LLAMA_CLBLAST
-
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
@@ -603,6 +635,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI
+ifndef LLAMA_NO_LLAMAFILE
+sgemm.o: sgemm.cpp sgemm.h ggml.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+endif
+
GF_CC := $(CC)
include scripts/get-flags.mk
@@ -646,7 +683,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
ifndef CUDA_DOCKER_ARCH
ifndef CUDA_POWER_ARCH
-$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
@@ -687,8 +724,8 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
-COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
common.o: common/common.cpp $(COMMON_H_DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -756,11 +793,11 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
+batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -788,10 +825,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
+# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+examples/server/%.hpp: examples/server/public/% Makefile
+ @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+ echo "unsigned char $${NAME}[] = {" && \
+ cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+ echo "};" && \
+ echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+ ) > $@
+
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -800,6 +846,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -950,11 +1000,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
- $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
- $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -962,7 +1008,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
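
For reference on the new `examples/server/%.hpp` rule above: here is a rough standalone sketch of the same `od | sed` pipeline run by hand on a hypothetical three-byte input. The file name and contents are invented for the example, and exact whitespace in the generated header may differ between `od` implementations:

```bash
printf 'hi\n' > index.html            # hypothetical asset
NAME=index_html
{
  echo "unsigned char ${NAME}[] = {"
  od -v -t x1 -An < index.html | sed -E 's/([0-9a-fA-F]+)/0x\1, /g'
  echo "};"
  echo "unsigned int ${NAME}_len = $(wc -c < index.html);"
} > index.html.hpp
# index.html.hpp now contains roughly:
#   unsigned char index_html[] = {
#    0x68, 0x69, 0x0a,
#   };
#   unsigned int index_html_len = 3;
```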
diff --git a/Package.swift b/Package.swift
index 8b7195869..183e64757 100644
--- a/Package.swift
+++ b/Package.swift
@@ -2,6 +2,45 @@
import PackageDescription
+var sources = [
+ "ggml.c",
+ "sgemm.cpp",
+ "llama.cpp",
+ "unicode.cpp",
+ "unicode-data.cpp",
+ "ggml-alloc.c",
+ "ggml-backend.c",
+ "ggml-quants.c",
+]
+
+var resources: [Resource] = []
+var linkerSettings: [LinkerSetting] = []
+var cSettings: [CSetting] = [
+ .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
+ .unsafeFlags(["-fno-objc-arc"]),
+ // NOTE: NEW_LAPACK will require iOS version 16.4+
+ // We should consider adding this in the future when we drop support for iOS 14
+ // (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
+ // .define("ACCELERATE_NEW_LAPACK"),
+ // .define("ACCELERATE_LAPACK_ILP64")
+]
+
+#if canImport(Darwin)
+sources.append("ggml-metal.m")
+resources.append(.process("ggml-metal.metal"))
+linkerSettings.append(.linkedFramework("Accelerate"))
+cSettings.append(
+ contentsOf: [
+ .define("GGML_USE_ACCELERATE"),
+ .define("GGML_USE_METAL")
+ ]
+)
+#endif
+
+#if os(Linux)
+ cSettings.append(.define("_GNU_SOURCE"))
+#endif
+
let package = Package(
name: "llama",
platforms: [
@@ -28,34 +67,11 @@ let package = Package(
"ggml-cuda.h",
"Makefile"
],
- sources: [
- "ggml.c",
- "llama.cpp",
- "unicode.cpp",
- "unicode-data.cpp",
- "ggml-alloc.c",
- "ggml-backend.c",
- "ggml-quants.c",
- "ggml-metal.m",
- ],
- resources: [
- .process("ggml-metal.metal")
- ],
+ sources: sources,
+ resources: resources,
publicHeadersPath: "spm-headers",
- cSettings: [
- .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
- .define("GGML_USE_ACCELERATE"),
- .unsafeFlags(["-fno-objc-arc"]),
- .define("GGML_USE_METAL"),
- // NOTE: NEW_LAPACK will required iOS version 16.4+
- // We should consider add this in the future when we drop support for iOS 14
- // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
- // .define("ACCELERATE_NEW_LAPACK"),
- // .define("ACCELERATE_LAPACK_ILP64")
- ],
- linkerSettings: [
- .linkedFramework("Accelerate")
- ]
+ cSettings: cSettings,
+ linkerSettings: linkerSettings
)
],
cxxLanguageStandard: .cxx11
diff --git a/README-sycl.md b/README-sycl.md
index 4372a32e3..dc98c7b3e 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -8,9 +8,9 @@
- [Linux](#linux)
- [Windows](#windows)
- [Environment Variable](#environment-variable)
-- [Known Issue](#known-issue)
-- [Q&A](#q&a)
-- [Todo](#todo)
+- [Known Issue](#known-issues)
+- [Q&A](#qa)
+- [TODO](#todo)
## Background
@@ -54,10 +54,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
## OS
-|OS|Status|Verified|
-|-|-|-|
-|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
-|Windows|Support|Windows 11|
+| OS | Status | Verified |
+|---------|---------|------------------------------------|
+| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39 |
+| Windows | Support | Windows 11 |
## Hardware
@@ -66,13 +66,13 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
**Verified devices**
-|Intel GPU| Status | Verified Model|
-|-|-|-|
-|Intel Data Center Max Series| Support| Max 1550|
-|Intel Data Center Flex Series| Support| Flex 170|
-|Intel Arc Series| Support| Arc 770, 730M|
-|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
-|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
+| Intel GPU | Status | Verified Model |
+|-------------------------------|---------|---------------------------------------|
+| Intel Data Center Max Series | Support | Max 1550, 1100 |
+| Intel Data Center Flex Series | Support | Flex 170 |
+| Intel Arc Series | Support | Arc 770, 730M |
+| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
+| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |
*Notes:*
@@ -84,24 +84,18 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
- **Execution Unit (EU)**
- If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.
-### Nvidia GPU
-The BLAS acceleration on Nvidia GPU through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)
+### Other Vendor GPU
**Verified devices**
-|Nvidia GPU| Status | Verified Model|
-|-|-|-|
-|Ampere Series| Support| A100, A4000|
-|Ampere Series *(Mobile)*| Support| RTX 40 Series|
-
-*Notes:*
- - Support for Nvidia targets through oneAPI is currently limited to Linux platforms.
-
- - Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs.
-
+| Nvidia GPU | Status | Verified Model |
+|--------------------------|---------|----------------|
+| Ampere Series | Support | A100, A4000 |
+| Ampere Series *(Mobile)* | Support | RTX 40 Series |
## Docker
The docker build option is currently limited to *intel GPU* targets.
+
### Build image
```sh
# Using FP16
@@ -167,30 +161,11 @@ Platform #0: Intel(R) OpenCL HD Graphics
- **Nvidia GPU**
-In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed.
-Installation can be verified by running the following:
-```sh
-nvidia-smi
-```
-Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*:
-```
-+---------------------------------------------------------------------------------------+
-| NVIDIA-SMI 535.54.03 Driver Version: 535.54.03 CUDA Version: 12.2 |
-|-----------------------------------------+----------------------+----------------------+
-| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
-| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
-| | | MIG M. |
-|=========================================+======================+======================|
-| 0 NVIDIA A100-PCIE-40GB On | 00000000:8D:00.0 Off | 0 |
-| N/A 36C P0 57W / 250W | 4MiB / 40960MiB | 0% Default |
-| | | Disabled |
-+-----------------------------------------+----------------------+----------------------+
-```
-
+In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
2. **Install Intel® oneAPI Base toolkit**
-- **Base installation**
+- **For Intel GPU**
The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
@@ -202,10 +177,10 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
- **Adding support to Nvidia GPUs**
-**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
+**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). Users should also make sure the plugin version matches that of the installed base toolkit *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
-**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
+**oneMKL for cuBLAS**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
```sh
git clone https://github.com/oneapi-src/oneMKL
@@ -237,7 +212,7 @@ When targeting an intel GPU, the user should expect one or more level-zero devic
- **Nvidia GPU**
-Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
+Similarly, users targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as below:
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
@@ -254,12 +229,14 @@ source /opt/intel/oneapi/setvars.sh
# Build LLAMA with MKL BLAS acceleration for intel GPU
mkdir -p build && cd build
-# Option 1: Use FP16 for better performance in long-prompt inference
-cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-# Or without "--build", run "make" next
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-# Option 2: Use FP32 by default
-cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+# Option 2: Use FP16
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
+# build all binaries
+cmake --build . --config Release -j -v
```
#### Nvidia GPU
@@ -273,11 +250,15 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with Nvidia BLAS acceleration through SYCL
mkdir -p build && cd build
-# Option 1: Use FP16 for better performance in long-prompt inference
-cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+# Option 2: Use FP16
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
+# build all binaries
+cmake --build . --config Release -j -v
-# Option 2: Use FP32 by default
-cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
```
### III. Run the inference
@@ -313,10 +294,10 @@ found 6 SYCL devices:
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero driver/runtime, recommended |
-|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases|
+| Attribute | Note |
+|------------------------|-------------------------------------------------------------|
+| compute capability 1.3 | Level-zero driver/runtime, recommended |
+| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
4. Launch inference
@@ -325,10 +306,10 @@ There are two device selection modes:
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
-|Device selection|Parameter|
-|-|-|
-|Single device|--split-mode none --main-gpu DEVICE_ID |
-|Multiple devices|--split-mode layer (default)|
+| Device selection | Parameter |
+|------------------|----------------------------------------|
+| Single device | --split-mode none --main-gpu DEVICE_ID |
+| Multiple devices | --split-mode layer (default) |
Examples:
@@ -357,7 +338,6 @@ Otherwise, you can run the script:
*Notes:*
-- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
```sh
@@ -436,9 +416,13 @@ mkdir -p build
cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
+
+# Option 2: Use FP16
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
-make
+make -j
```
Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
@@ -486,10 +470,10 @@ found 6 SYCL devices:
```
-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
+| Attribute | Note |
+|------------------------|-----------------------------------------------------------|
+| compute capability 1.3 | Level-zero running time, recommended |
+| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
4. Launch inference
@@ -499,10 +483,10 @@ There are two device selection modes:
- Single device: Use one device assigned by user.
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
-|Device selection|Parameter|
-|-|-|
-|Single device|--split-mode none --main-gpu DEVICE_ID |
-|Multiple devices|--split-mode layer (default)|
+| Device selection | Parameter |
+|------------------|----------------------------------------|
+| Single device | --split-mode none --main-gpu DEVICE_ID |
+| Multiple devices | --split-mode layer (default) |
Examples:
@@ -525,7 +509,6 @@ Otherwise, run the following wrapper script:
Note:
-- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `main.exe` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
```sh
@@ -540,29 +523,23 @@ use 1 SYCL GPUs: [0] with Max compute units:512
#### Build
-|Name|Value|Function|
-|-|-|-|
-|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.|
-|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.|
-|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.|
-|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.|
-|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.|
+| Name | Value | Function |
+|--------------------|-----------------------------------|---------------------------------------------|
+| LLAMA_SYCL | ON (mandatory) | Enable build with SYCL code path. |
+| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
+| LLAMA_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
+| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
+| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
#### Runtime
-|Name|Value|Function|
-|-|-|-|
-|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
-|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|
+| Name | Value | Function |
+|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
+| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
+| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
## Known Issues
-- Hanging during startup
-
- llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang.
-
- - **Solution**: add `--no-mmap` or `--mmap 0` flag to the `main` executable.
-
- `Split-mode:[row]` is not supported.
## Q&A
@@ -574,7 +551,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
- General compiler error:
- - Remove build folder or try a clean-build.
+ - Remove the **build** folder or try a clean build.
- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.
@@ -591,6 +568,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
### **GitHub contribution**:
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
-## Todo
+## TODO
- Support row layer split for multiple card runs.
diff --git a/README.md b/README.md
index b60662387..a2aa9214f 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Recent API changes
+- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
@@ -19,7 +20,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Hot topics
-- **MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387**
+- **BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920**
+- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
@@ -92,9 +94,11 @@ Typically finetunes of the base models below are supported as well.
- [X] LLaMA 🦙
- [x] LLaMA 2 🦙🦙
+- [x] LLaMA 3 🦙🦙🦙
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
-- [X] Falcon
+- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
+- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
@@ -117,10 +121,14 @@ Typically finetunes of the base models below are supported as well.
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
- [x] [Gemma](https://ai.google.dev/gemma)
- [x] [Mamba](https://github.com/state-spaces/mamba)
+- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
- [x] [Xverse](https://huggingface.co/models?search=xverse)
-- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
+- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
+- [x] [OLMo](https://allenai.org/olmo)
+
+(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
**Multimodal models:**
@@ -130,6 +138,8 @@ Typically finetunes of the base models below are supported as well.
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
+- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
+- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
**HTTP server**
@@ -185,6 +195,8 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [MindMac](https://mindmac.app) (proprietary)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
+- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
+- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
@@ -483,20 +495,20 @@ Building the program with BLAS support may lead to some performance improvements
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
- | Option | Legal values | Default | Description |
- |--------------------------------|------------------------|---------|-------------|
- | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
- | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
- | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
- | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
- | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
+ | Option | Legal values | Default | Description |
+ |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+ | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
+ | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+ | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
+ | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
+ | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+ | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
- #### hipBLAS
This provides BLAS acceleration on HIP-supported AMD GPUs.
Make sure to have ROCm installed.
- You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
+ You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
- Using `make`:
```bash
@@ -532,18 +544,18 @@ Building the program with BLAS support may lead to some performance improvements
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
- | Option | Legal values | Default | Description |
- |-------------------------|------------------------|---------|-------------|
- | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
- | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
- | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+ | Option | Legal values | Default | Description |
+ |-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+ | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+ | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+ | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### CLBlast
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
- - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
+ - For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.
- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
@@ -568,6 +580,12 @@ Building the program with BLAS support may lead to some performance improvements
Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
+ Linux packaging:
+ Fedora Linux:
+ ```bash
+ sudo dnf install clblast
+ ```
+
Alternatively, they may be built from source.
-
@@ -744,11 +762,11 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
| Model | Original size | Quantized size (Q4_0) |
-|------:|--------------:|-----------------------:|
-| 7B | 13 GB | 3.9 GB |
-| 13B | 24 GB | 7.8 GB |
-| 30B | 60 GB | 19.5 GB |
-| 65B | 120 GB | 38.5 GB |
+|------:|--------------:|----------------------:|
+| 7B | 13 GB | 3.9 GB |
+| 13B | 24 GB | 7.8 GB |
+| 30B | 60 GB | 19.5 GB |
+| 65B | 120 GB | 38.5 GB |
### Quantization
@@ -756,7 +774,7 @@ Several quantization methods are supported. They differ in the resulting model d
*(outdated)*
-| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
+| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G |
@@ -1104,7 +1122,9 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
-- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT`
+- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
+
+![matmul](media/matmul.png)
### Docs
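
As a short restatement of the `ggml_mul_mat` convention from the contributing notes above, using the same naming (dimension 0 = columns, dimension 1 = rows), here is a sketch of the dimension bookkeeping implied by that formula rather than a separate statement of the API contract:

$$
A \in \mathbb{R}^{m \times k}, \quad B \in \mathbb{R}^{n \times k}
\quad\Longrightarrow\quad
C = \mathrm{ggml\_mul\_mat}(A, B) \in \mathbb{R}^{n \times m},
\qquad
C^{\top} = A\,B^{\top} \iff C = B\,A^{\top}
$$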
diff --git a/SECURITY.md b/SECURITY.md
index 14504b1bf..f4322c6ee 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -49,11 +49,11 @@ If you intend to run multiple models in parallel with shared memory, it is your
1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.
-1. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
+2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
-1. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
+3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
-1. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
+4. Hardware Attacks: GPUs or TPUs can also be attacked. [Research](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side-channel attacks on GPUs are possible, which can leak data from other models or processes running on the same system at the same time.
## Reporting a vulnerability
diff --git a/build.zig b/build.zig
index 7f36e5968..96783574f 100644
--- a/build.zig
+++ b/build.zig
@@ -112,6 +112,7 @@ pub fn build(b: *std.build.Builder) !void {
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
const ggml = make.obj("ggml", "ggml.c");
+ const sgemm = make.obj("sgemm", "sgemm.cpp");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
@@ -128,15 +129,44 @@ pub fn build(b: *std.build.Builder) !void {
const clip = make.obj("clip", "examples/llava/clip.cpp");
const llava = make.obj("llava", "examples/llava/llava.cpp");
- _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
- _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
- _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
- _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
- _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
- _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
+ _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
+ _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+ _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+ _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+ _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
+ _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
- const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
+ const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}
+
+ const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
+ for (server_assets) |asset| {
+ const input_path = b.fmt("examples/server/public/{s}", .{asset});
+ const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
+
+ // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
+
+ const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
+ defer b.allocator.free(input);
+
+ var buf = std.ArrayList(u8).init(b.allocator);
+ defer buf.deinit();
+
+ for (input) |byte| {
+ try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
+ }
+
+ var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
+ defer b.allocator.free(name);
+ std.mem.replaceScalar(u8, name, '.', '_');
+
+ try std.fs.cwd().writeFile(output_path, b.fmt(
+ "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
+ .{ name, buf.items, name, input.len },
+ ));
+
+ std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
+ }
}
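
For reference, a hypothetical header emitted by the loop above for an asset named `index.html` would look roughly like this (byte values and length are illustrative only; dots and dashes in the asset name become underscores in the symbol):

```cpp
// examples/server/index.html.hpp (illustrative generated content)
unsigned char index_html[] = {0x3C, 0x21, 0x44, /* ... remaining bytes ... */};
unsigned int index_html_len = 1234;  // the real value is the asset size in bytes
```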
diff --git a/ci/run.sh b/ci/run.sh
index 19776b5f7..da05f0d48 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -153,6 +153,55 @@ function gg_sum_ctest_release {
gg_printf '```\n'
}
+# test_scripts_debug
+
+function gg_run_test_scripts_debug {
+ cd ${SRC}
+
+ set -e
+
+ # TODO: too slow, run on dedicated node
+ (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+ #(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+
+ set +e
+}
+
+function gg_sum_test_scripts_debug {
+ gg_printf '### %s\n\n' "${ci}"
+
+ gg_printf 'Runs test scripts in debug mode\n'
+ gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+ gg_printf '```\n'
+ gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
+ gg_printf '```\n'
+ gg_printf '\n'
+}
+
+# test_scripts_release
+
+function gg_run_test_scripts_release {
+ cd ${SRC}
+
+ set -e
+
+ (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+ (cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+
+ set +e
+}
+
+function gg_sum_test_scripts_release {
+ gg_printf '### %s\n\n' "${ci}"
+
+ gg_printf 'Runs test scripts in release mode\n'
+ gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+ gg_printf '```\n'
+ gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
+ gg_printf '```\n'
+ gg_printf '\n'
+}
+
function gg_get_model {
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
@@ -642,6 +691,9 @@ test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run embd_bge_small
+ test $ret -eq 0 && gg_run test_scripts_debug
+ test $ret -eq 0 && gg_run test_scripts_release
+
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run open_llama_3b_v2
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 1d840e5f7..0ec8d6d8d 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -47,9 +47,6 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
-set(TARGET json-schema-to-grammar)
-add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h)
-
set(TARGET common)
add_library(${TARGET} STATIC
@@ -63,6 +60,7 @@ add_library(${TARGET} STATIC
grammar-parser.h
grammar-parser.cpp
json.hpp
+ json-schema-to-grammar.cpp
train.h
train.cpp
ngram-cache.h
diff --git a/common/common.cpp b/common/common.cpp
index 98fc8388c..aa494291d 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,4 +1,6 @@
#include "common.h"
+#include "json.hpp"
+#include "json-schema-to-grammar.h"
#include "llama.h"
#include
@@ -68,6 +70,8 @@
#define LLAMA_CURL_MAX_HEADER_LENGTH 256
#endif // LLAMA_USE_CURL
+using json = nlohmann::ordered_json;
+
int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
@@ -104,6 +108,79 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#include
+
+static void cpuid(unsigned leaf, unsigned subleaf,
+ unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
+ __asm__("movq\t%%rbx,%%rsi\n\t"
+ "cpuid\n\t"
+ "xchgq\t%%rbx,%%rsi"
+ : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
+ : "0"(leaf), "2"(subleaf));
+}
+
+static int pin_cpu(int cpu) {
+ cpu_set_t mask;
+ CPU_ZERO(&mask);
+ CPU_SET(cpu, &mask);
+ return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
+}
+
+static bool is_hybrid_cpu(void) {
+ unsigned eax, ebx, ecx, edx;
+ cpuid(7, 0, &eax, &ebx, &ecx, &edx);
+ return !!(edx & (1u << 15));
+}
+
+static bool is_running_on_efficiency_core(void) {
+ unsigned eax, ebx, ecx, edx;
+ cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
+ int intel_atom = 0x20;
+ int core_type = (eax & 0xff000000u) >> 24;
+ return core_type == intel_atom;
+}
+
+static int count_math_cpus(int cpu_count) {
+ int result = 0;
+ for (int cpu = 0; cpu < cpu_count; ++cpu) {
+ if (pin_cpu(cpu)) {
+ return -1;
+ }
+ if (is_running_on_efficiency_core()) {
+ continue; // efficiency cores harm lockstep threading
+ }
+ ++cpu; // hyperthreading isn't useful for linear algebra
+ ++result;
+ }
+ return result;
+}
+
+#endif // __x86_64__ && __linux__
+
+/**
+ * Returns number of CPUs on system that are useful for math.
+ */
+int get_math_cpu_count() {
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+ int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
+ if (cpu_count < 1) {
+ return get_num_physical_cores();
+ }
+ if (is_hybrid_cpu()) {
+ cpu_set_t affinity;
+ if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
+ int result = count_math_cpus(cpu_count);
+ pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
+ if (result > 0) {
+ return result;
+ }
+ }
+ }
+#endif
+ return get_num_physical_cores();
+}
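+
+// A small usage sketch (not from the patch) showing how the new helper relates to the
+// existing one; it assumes the updated common.h from this change is on the include path:
+//
+//     #include <cstdio>
+//     #include "common.h"
+//
+//     int main() {
+//         // on hybrid x86-64 Linux CPUs this skips efficiency cores and SMT siblings;
+//         // everywhere else it falls back to get_num_physical_cores()
+//         printf("default n_threads   : %d\n", get_math_cpu_count());
+//         printf("physical cores (old): %d\n", get_num_physical_cores());
+//         return 0;
+//     }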
+
void process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
@@ -157,15 +234,63 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
return result;
}
+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+ const char * sep = strchr(data, '=');
+ if (sep == nullptr || sep - data >= 128) {
+ fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+ return false;
+ }
+ llama_model_kv_override kvo;
+ std::strncpy(kvo.key, data, sep - data);
+ kvo.key[sep - data] = 0;
+ sep++;
+ if (strncmp(sep, "int:", 4) == 0) {
+ sep += 4;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+ kvo.val_i64 = std::atol(sep);
+ } else if (strncmp(sep, "float:", 6) == 0) {
+ sep += 6;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+ kvo.val_f64 = std::atof(sep);
+ } else if (strncmp(sep, "bool:", 5) == 0) {
+ sep += 5;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+ if (std::strcmp(sep, "true") == 0) {
+ kvo.val_bool = true;
+ } else if (std::strcmp(sep, "false") == 0) {
+ kvo.val_bool = false;
+ } else {
+ fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+ return false;
+ }
+ } else if (strncmp(sep, "str:", 4) == 0) {
+ sep += 4;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+ if (strlen(sep) > 127) {
+ fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+ return false;
+ }
+ strncpy(kvo.val_str, sep, 127);
+ kvo.val_str[127] = '\0';
+ } else {
+ fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+ return false;
+ }
+ overrides.emplace_back(std::move(kvo));
+ return true;
+}
+
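+// Hedged sketch of how the newly extracted helper is meant to be used (it mirrors the
+// --override-kv branch below and assumes llama.h / common.h are included); the override
+// strings are illustrative:
+//
+//     std::vector<llama_model_kv_override> overrides;
+//
+//     // each call appends one parsed override on success
+//     bool ok = parse_kv_override("tokenizer.ggml.add_bos_token=bool:false", overrides)
+//            && parse_kv_override("general.name=str:my-model", overrides);
+//
+//     // on success: overrides[0].tag == LLAMA_KV_OVERRIDE_TYPE_BOOL, overrides[0].val_bool == false
+//     //             overrides[1].tag == LLAMA_KV_OVERRIDE_TYPE_STR,  overrides[1].val_str  == "my-model"
+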
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
- llama_sampling_params& sparams = params.sparams;
+ llama_sampling_params & sparams = params.sparams;
if (arg == "-s" || arg == "--seed") {
if (++i >= argc) {
invalid_param = true;
return true;
}
+ // This is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
params.seed = std::stoul(argv[i]);
+ sparams.seed = std::stoul(argv[i]);
return true;
}
if (arg == "-t" || arg == "--threads") {
@@ -1010,6 +1135,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.n_print = std::stoi(argv[i]);
return true;
}
+ if (arg == "--check-tensors") {
+ params.check_tensors = true;
+ return true;
+ }
if (arg == "--ppl-output-type") {
if (++i >= argc) {
invalid_param = true;
@@ -1148,52 +1277,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
);
return true;
}
+ if (arg == "-j" || arg == "--json-schema") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
+ return true;
+ }
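+
+ // A minimal sketch of what the new -j branch does under the hood, assuming json.hpp and
+ // json-schema-to-grammar.h are available as added at the top of this file:
+ //
+ //     #include <iostream>
+ //     #include <string>
+ //     #include "json.hpp"
+ //     #include "json-schema-to-grammar.h"
+ //
+ //     int main() {
+ //         using json = nlohmann::ordered_json;
+ //         // converts a JSON schema into a GBNF grammar string, which the CLI stores in sparams.grammar
+ //         const std::string grammar = json_schema_to_grammar(json::parse(R"({"type": "integer"})"));
+ //         std::cout << grammar << std::endl;
+ //         return 0;
+ //     }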
if (arg == "--override-kv") {
if (++i >= argc) {
invalid_param = true;
return true;
}
- char* sep = strchr(argv[i], '=');
- if (sep == nullptr || sep - argv[i] >= 128) {
- fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
- invalid_param = true;
- return true;
- }
- struct llama_model_kv_override kvo;
- std::strncpy(kvo.key, argv[i], sep - argv[i]);
- kvo.key[sep - argv[i]] = 0;
- sep++;
- if (strncmp(sep, "int:", 4) == 0) {
- sep += 4;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
- kvo.int_value = std::atol(sep);
- }
- else if (strncmp(sep, "float:", 6) == 0) {
- sep += 6;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
- kvo.float_value = std::atof(sep);
- }
- else if (strncmp(sep, "bool:", 5) == 0) {
- sep += 5;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
- if (std::strcmp(sep, "true") == 0) {
- kvo.bool_value = true;
- }
- else if (std::strcmp(sep, "false") == 0) {
- kvo.bool_value = false;
- }
- else {
- fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
- invalid_param = true;
- return true;
- }
- }
- else {
+ if (!parse_kv_override(argv[i], params.kv_overrides)) {
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
invalid_param = true;
return true;
}
- params.kv_overrides.push_back(kvo);
return true;
}
#ifndef LOG_DISABLE_LOGS
@@ -1353,6 +1454,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
printf(" --grammar-file FNAME file to read grammar from\n");
+ printf(" -j SCHEMA, --json-schema SCHEMA\n");
+ printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n");
+ printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n");
printf(" --cfg-negative-prompt PROMPT\n");
printf(" negative prompt to use for guidance. (default: empty)\n");
printf(" --cfg-negative-prompt-file FNAME\n");
@@ -1461,9 +1565,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
- printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+ printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" -ptc N, --print-token-count N\n");
printf(" print token count every N tokens (default: %d)\n", params.n_print);
+ printf(" --check-tensors check model tensor data for invalid values\n");
printf("\n");
#ifndef LOG_DISABLE_LOGS
log_print_usage();
@@ -1588,6 +1693,18 @@ std::vector string_split(std::string input, char separator) {
return parts;
}
+std::string string_strip(const std::string & str) {
+ size_t start = 0;
+ size_t end = str.size();
+ while (start < end && std::isspace(str[start])) {
+ start++;
+ }
+ while (end > start && std::isspace(str[end - 1])) {
+ end--;
+ }
+ return str.substr(start, end - start);
+}
+
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
{"top_k", llama_sampler_type::TOP_K},
@@ -1684,6 +1801,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
+ mparams.check_tensors = params.check_tensors;
if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;
} else {
@@ -1745,6 +1863,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type;
cparams.defrag_thold = params.defrag_thold;
+ cparams.cb_eval = params.cb_eval;
+ cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
@@ -2192,7 +2312,7 @@ std::tuple llama_init_from_gpt_par
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
}
- {
+ if (params.warmup) {
LOG("warming up the model with an empty run\n");
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
@@ -2236,12 +2356,12 @@ std::vector llama_tokenize(
return result;
}
-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
if (n_tokens < 0) {
result.resize(-n_tokens);
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
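
A short usage sketch for the extended helper (the `special` flag defaults to true, per the header change further below); `ctx` and `token` are assumed to come from an existing llama.cpp session:

```cpp
// render the token as plain text, hiding special/control tokens
std::string plain = llama_token_to_piece(ctx, token, /*special=*/ false);

// default behaviour after this patch: special tokens are rendered too
std::string with_special = llama_token_to_piece(ctx, token);
```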
diff --git a/common/common.h b/common/common.h
index a7f476c1b..eea63a114 100644
--- a/common/common.h
+++ b/common/common.h
@@ -39,6 +39,7 @@ extern char const *LLAMA_BUILD_TARGET;
struct llama_control_vector_load_info;
+int get_math_cpu_count();
int32_t get_num_physical_cores();
//
@@ -48,7 +49,7 @@ int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
- int32_t n_threads = get_num_physical_cores();
+ int32_t n_threads = get_math_cpu_count();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
@@ -80,10 +81,13 @@ struct gpt_params {
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
+ ggml_backend_sched_eval_callback cb_eval = nullptr;
+ void * cb_eval_user_data = nullptr;
+
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
- llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
- llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+ enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
// // sampling parameters
struct llama_sampling_params sparams;
@@ -156,6 +160,8 @@ struct gpt_params {
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
+ bool warmup = true; // warmup run
+ bool check_tensors = false; // validate tensor data
std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V
@@ -165,6 +171,8 @@ struct gpt_params {
std::string image = ""; // path to an image file
};
+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -188,6 +196,7 @@ bool validate_file_name(const std::string & filename);
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
std::vector<std::string> string_split(std::string input, char separator);
+std::string string_strip(const std::string & str);
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
//
@@ -232,11 +241,12 @@ std::vector llama_tokenize(
bool add_special,
bool parse_special = false);
-// tokenizes a token into a piece
+// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece(
const struct llama_context * ctx,
- llama_token token);
+ llama_token token,
+ bool special = true);
// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
// that takes into account the tokenizer type and decides how to handle the leading space
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 0e4680346..0f8f1b1d4 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -11,35 +11,101 @@
using json = nlohmann::ordered_json;
+template <typename Iterator>
+static std::string join(Iterator begin, Iterator end, const std::string & separator);
+
+static std::string repeat(const std::string & str, size_t n);
+
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
+ if (separator_rule.empty()) {
+ if (min_items == 0 && max_items == 1) {
+ return item_rule + "?";
+ } else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
+ return item_rule + "+";
+ }
+ }
+
+ std::string result;
+ if (min_items > 0) {
+ if (item_rule_is_literal && separator_rule.empty()) {
+ result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
+ } else {
+ std::vector<std::string> items(min_items, item_rule);
+ result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
+ }
+ }
+
+ std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
+ auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
+
+ if (up_to_n == 0) {
+ return "";
+ } else if (up_to_n == 1) {
+ return "(" + content + ")?";
+ } else if (!separator_rule.empty() && !prefix_with_sep) {
+ return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
+ } else {
+ std::string res = repeat("(" + content + " ", up_to_n);
+ // strip trailing space
+ res = res.substr(0, res.length() - 1);
+ res += repeat(")?", up_to_n);
+ return res;
+ }
+ };
+
+ if (min_items > 0 && max_items != min_items) {
+ result += " ";
+ }
+
+ if (max_items != std::numeric_limits<int>::max()) {
+ result += opt_repetitions(max_items - min_items, min_items > 0);
+ } else {
+ std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
+ if (min_items == 0 && !separator_rule.empty()) {
+ result = "(" + item_rule + " " + item_operator + "*)?";
+ } else {
+ result += item_operator + "*";
+ }
+ }
+
+ return result;
+}
+
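+// Hand-derived examples of the strings the helper above produces (worked out from the
+// branches, not executed), which may help when reading the grammar changes further down;
+// INT_MAX stands for std::numeric_limits<int>::max():
+//
+//     build_repetition("[0-9]", 0, 1)                     -> "[0-9]?"
+//     build_repetition("[0-9]", 1, INT_MAX)               -> "[0-9]+"
+//     build_repetition("[0-9]", 2, 3)                     -> "[0-9] [0-9] ([0-9])?"
+//     build_repetition("item", 0, INT_MAX, "\",\" space") -> "(item (\",\" space item)*)?"
+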
const std::string SPACE_RULE = "\" \"?";
-std::unordered_map<std::string, std::string> PRIMITIVE_RULES = {
- {"boolean", "(\"true\" | \"false\") space"},
- {"number", "(\"-\"? ([0-9] | [1-9] [0-9]*)) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space"},
- {"integer", "(\"-\"? ([0-9] | [1-9] [0-9]*)) space"},
- {"value", "object | array | string | number | boolean"},
- {"object", "\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space"},
- {"array", "\"[\" space ( value (\",\" space value)* )? \"]\" space"},
- {"uuid", "\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space"},
- {"string", " \"\\\"\" (\n"
- " [^\"\\\\] |\n"
- " \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n"
- " )* \"\\\"\" space"},
- {"null", "\"null\" space"}
+struct BuiltinRule {
+ std::string content;
+ std::vector<std::string> deps;
};
-std::vector<std::string> OBJECT_RULE_NAMES = {"object", "array", "string", "number", "boolean", "null", "value"};
-std::unordered_map<std::string, std::string> DATE_RULES = {
- {"date", "[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )"},
- {"time", "([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )"},
- {"date-time", "date \"T\" time"},
- {"date-string", "\"\\\"\" date \"\\\"\" space"},
- {"time-string", "\"\\\"\" time \"\\\"\" space"},
- {"date-time-string", "\"\\\"\" date-time \"\\\"\" space"}
+const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
+
+std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
+ {"boolean", {"(\"true\" | \"false\") space", {}}},
+ {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
+ {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
+ {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
+ {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
+ {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
+ {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
+ {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
+ {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+ "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+ "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+ "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+ "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
+ {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
+ {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
+ {"null", {"\"null\" space", {}}},
+};
+
+std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
+ {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+ {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+ {"date-time", {"date \"T\" time", {"date", "time"}}},
+ {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
+ {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
+ {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
};
static bool is_reserved_name(const std::string & name) {
@@ -47,7 +113,7 @@ static bool is_reserved_name(const std::string & name) {
if (RESERVED_NAMES.empty()) {
RESERVED_NAMES.insert("root");
for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
- for (const auto &p : DATE_RULES) RESERVED_NAMES.insert(p.first);
+ for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
}
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
}
@@ -192,7 +258,7 @@ private:
if (_dotall) {
rule = "[\\U00000000-\\U0010FFFF]";
} else {
- rule = "[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]";
+ rule = "[^\\x0A\\x0D]";
}
return _add_rule("dot", rule);
};
@@ -308,47 +374,21 @@ private:
auto &sub = last.first;
auto sub_is_literal = last.second;
- if (min_times == 0 && max_times == std::numeric_limits<int>::max()) {
- sub += "*";
- } else if (min_times == 0 && max_times == 1) {
- sub += "?";
- } else if (min_times == 1 && max_times == std::numeric_limits<int>::max()) {
- sub += "+";
- } else {
- if (!sub_is_literal) {
- std::string & sub_id = sub_rule_ids[sub];
- if (sub_id.empty()) {
- sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
- }
- sub = sub_id;
+ if (!sub_is_literal) {
+ std::string & sub_id = sub_rule_ids[sub];
+ if (sub_id.empty()) {
+ sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
}
- std::string result;
- if (sub_is_literal && min_times > 0) {
- result = "\"" + repeat(sub.substr(1, sub.length() - 2), min_times) + "\"";
- } else {
- for (int j = 0; j < min_times; j++) {
- if (j > 0) {
- result += " ";
- }
- result += sub;
- }
- }
- if (min_times > 0 && min_times < max_times) {
- result += " ";
- }
- if (max_times == std::numeric_limits<int>::max()) {
- result += sub + "*";
- } else {
- for (int j = min_times; j < max_times; j++) {
- if (j > min_times) {
- result += " ";
- }
- result += sub + "?";
- }
- }
- seq.back().first = result;
- seq.back().second = false;
+ sub = sub_id;
}
+ seq.back().first = build_repetition(
+ sub_is_literal ? "\"" + sub + "\"" : sub,
+ min_times,
+ max_times,
+ "",
+ sub_is_literal
+ );
+ seq.back().second = false;
} else {
std::string literal;
auto is_non_literal = [&](char c) {
@@ -424,7 +464,7 @@ private:
if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
- std::string kv_rule = _add_rule(sub_name + "-kv", _add_rule("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
+ std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
prop_kv_rule_names["*"] = kv_rule;
optional_props.push_back("*");
}
@@ -486,6 +526,25 @@ private:
return rule;
}
+ std::string _add_primitive(const std::string & name, const BuiltinRule & rule) {
+ auto n = _add_rule(name, rule.content);
+ for (const auto & dep : rule.deps) {
+ BuiltinRule dep_rule;
+ auto it = PRIMITIVE_RULES.find(dep);
+ if (it == PRIMITIVE_RULES.end()) {
+ it = STRING_FORMAT_RULES.find(dep);
+ if (it == STRING_FORMAT_RULES.end()) {
+ _errors.push_back("Rule " + dep + " not known");
+ continue;
+ }
+ }
+ if (_rules.find(dep) == _rules.end()) {
+ _add_primitive(dep, it->second);
+ }
+ }
+ return n;
+ }
+
public:
SchemaConverter(
const std::function & fetch_json,
@@ -647,49 +706,33 @@ public:
return _add_rule(rule_name, rule);
} else {
std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
- std::string list_item_operator = "( \",\" space " + item_rule_name + " )";
- std::string successive_items;
int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
- int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : -1;
- if (min_items > 0) {
- successive_items += repeat(list_item_operator, min_items - 1);
- min_items--;
- }
- if (max_items >= 0 && max_items > min_items) {
- successive_items += repeat(list_item_operator + "?", max_items - min_items - 1);
- } else {
- successive_items += list_item_operator + "*";
- }
- std::string rule;
- if (min_items == 0) {
- rule = "\"[\" space ( " + item_rule_name + " " + successive_items + " )? \"]\" space";
- } else {
- rule = "\"[\" space " + item_rule_name + " " + successive_items + " \"]\" space";
- }
- return _add_rule(rule_name, rule);
+ int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
+
+ return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
}
} else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
return _visit_pattern(schema["pattern"], rule_name);
} else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
- return _add_rule(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
- } else if ((schema_type.is_null() || schema_type == "string") && DATE_RULES.find(schema_format) != DATE_RULES.end()) {
- for (const auto & kv : DATE_RULES) {
- _add_rule(kv.first, kv.second);
- }
- return schema_format + "-string";
+ return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
+ } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
+ auto prim_name = schema_format + "-string";
+ return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name)));
+ } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
+ std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
+ int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
+ int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
+ return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
} else if (schema.empty() || schema_type == "object") {
- for (const auto & n : OBJECT_RULE_NAMES) {
- _add_rule(n, PRIMITIVE_RULES.at(n));
- }
- return _add_rule(rule_name, "object");
+ return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
} else {
if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
_errors.push_back("Unrecognized schema: " + schema.dump());
return "";
}
// TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
- return _add_rule(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
+ return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
}
}
diff --git a/common/log.h b/common/log.h
index e4edcac7d..2b2f0e455 100644
--- a/common/log.h
+++ b/common/log.h
@@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
//
-#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
#define LOG_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
@@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE
// USE LOG_TEE() INSTEAD
//
-#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
#define LOG_TEE_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 45d68b26c..cc83600d9 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,4 +1,6 @@
+#define LLAMA_API_INTERNAL
#include "sampling.h"
+#include <random>
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
struct llama_sampling_context * result = new llama_sampling_context();
@@ -33,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
result->prev.resize(params.n_prev);
+ llama_sampling_set_rng_seed(result, params.seed);
+
return result;
}
@@ -62,6 +66,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
ctx->cur.clear();
}
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
+ if (seed == LLAMA_DEFAULT_SEED) {
+ seed = std::random_device{}();
+ }
+ ctx->rng.seed(seed);
+}
+
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
if (dst->grammar) {
llama_grammar_free(dst->grammar);
@@ -203,7 +214,7 @@ static llama_token llama_sampling_sample_impl(
sampler_queue(ctx_main, params, cur_p, min_keep);
- id = llama_sample_token(ctx_main, &cur_p);
+ id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
//{
// const int n_top = 10;
diff --git a/common/sampling.h b/common/sampling.h
index 639b819ab..cf7081e36 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -4,9 +4,10 @@
#include "grammar-parser.h"
+#include <random>
#include
-#include
#include
+#include
// sampler types
enum class llama_sampler_type : char {
@@ -20,25 +21,26 @@ enum class llama_sampler_type : char {
// sampling parameters
typedef struct llama_sampling_params {
- int32_t n_prev = 64; // number of previous tokens to remember
- int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
- int32_t top_k = 40; // <= 0 to use vocab size
- float top_p = 0.95f; // 1.0 = disabled
- float min_p = 0.05f; // 0.0 = disabled
- float tfs_z = 1.00f; // 1.0 = disabled
- float typical_p = 1.00f; // 1.0 = disabled
- float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
- float dynatemp_range = 0.00f; // 0.0 = disabled
- float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
- int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
- float penalty_repeat = 1.00f; // 1.0 = disabled
- float penalty_freq = 0.00f; // 0.0 = disabled
- float penalty_present = 0.00f; // 0.0 = disabled
- int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
- float mirostat_tau = 5.00f; // target entropy
- float mirostat_eta = 0.10f; // learning rate
- bool penalize_nl = false; // consider newlines as a repeatable token
+ int32_t n_prev = 64; // number of previous tokens to remember
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+ int32_t top_k = 40; // <= 0 to use vocab size
+ float top_p = 0.95f; // 1.0 = disabled
+ float min_p = 0.05f; // 0.0 = disabled
+ float tfs_z = 1.00f; // 1.0 = disabled
+ float typical_p = 1.00f; // 1.0 = disabled
+ float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+ float dynatemp_range = 0.00f; // 0.0 = disabled
+ float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+ int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+ float penalty_repeat = 1.00f; // 1.0 = disabled
+ float penalty_freq = 0.00f; // 0.0 = disabled
+ float penalty_present = 0.00f; // 0.0 = disabled
+ int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+ float mirostat_tau = 5.00f; // target entropy
+ float mirostat_eta = 0.10f; // learning rate
+ bool penalize_nl = false; // consider newlines as a repeatable token
+ uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
std::vector<llama_sampler_type> samplers_sequence = {
llama_sampler_type::TOP_K,
@@ -79,6 +81,8 @@ struct llama_sampling_context {
// TODO: replace with ring-buffer
std::vector<llama_token> prev;
std::vector<llama_token_data> cur;
+
+ std::mt19937 rng;
};
#include "common.h"
@@ -93,6 +97,9 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
// - reset grammar
void llama_sampling_reset(llama_sampling_context * ctx);
+// Set the sampler seed
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
+
// Copy the sampler context
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
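
A hedged usage sketch of the new per-sampler RNG seed (field and function names as introduced in this patch); everything else about the sampling API is unchanged:

```cpp
#include "sampling.h"

void example_deterministic_sampler() {
    llama_sampling_params sparams;
    sparams.seed = 42; // new field: seeds the context's std::mt19937 in llama_sampling_init()

    llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);

    // ... call llama_sampling_sample(ctx_sampling, ...) as before; token choices are now reproducible ...

    llama_sampling_set_rng_seed(ctx_sampling, 1234); // re-seed at any point, e.g. per request

    llama_sampling_free(ctx_sampling);
}
```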
diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
new file mode 100644
index 000000000..1c559c3f6
--- /dev/null
+++ b/convert-hf-to-gguf-update.py
@@ -0,0 +1,275 @@
+# This script downloads the tokenizer models of the specified models from Huggingface and
+# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
+#
+# This is necessary in order to analyze the type of pre-tokenizer used by the model and
+# provide the necessary information to llama.cpp via the GGUF header in order to implement
+# the same pre-tokenizer.
+#
+# ref: https://github.com/ggerganov/llama.cpp/pull/6920
+#
+# Instructions:
+#
+# - Add a new model to the "models" list
+# - Run the script with your huggingface token:
+#
+# python3 convert-hf-to-gguf-update.py <huggingface_token>
+#
+# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
+# - Update llama.cpp with the new pre-tokenizer if necessary
+#
+# TODO: generate tokenizer tests for llama.cpp
+# TODO: automate the update of convert-hf-to-gguf.py
+#
+
+import os
+import requests
+import sys
+import json
+
+from hashlib import sha256
+from enum import IntEnum, auto
+
+class TOKENIZER_TYPE(IntEnum):
+ SPM = auto()
+ BPE = auto()
+ WPM = auto()
+
+# TODO: this string has to exercise as much pre-tokenizer functionality as possible
+# will be updated with time - contributions welcome
+chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+
+if len(sys.argv) == 2:
+ token = sys.argv[1]
+else:
+ print("Usage: python convert-hf-to-gguf-update.py ")
+ sys.exit(1)
+
+# TODO: add models here, base models preferred
+models = [
+ { "name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+ { "name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+ { "name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
+ { "name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
+ { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
+ { "name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
+ { "name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+ { "name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
+ { "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
+ { "name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+ ]
+
+# make directory "models/tokenizers" if it doesn't exist
+if not os.path.exists("models/tokenizers"):
+ os.makedirs("models/tokenizers")
+
+def download_file_with_auth(url, token, save_path):
+ headers = {"Authorization": f"Bearer {token}"}
+ response = requests.get(url, headers=headers)
+ if response.status_code == 200:
+ with open(save_path, 'wb') as f:
+ f.write(response.content)
+ print(f"File {save_path} downloaded successfully")
+ else:
+ print(f"Failed to download file. Status code: {response.status_code}")
+
+# download the tokenizer models
+for model in models:
+ name = model["name"]
+ repo = model["repo"]
+ tokt = model["tokt"]
+
+ if not os.path.exists(f"models/tokenizers/{name}"):
+ os.makedirs(f"models/tokenizers/{name}")
+ else:
+ print(f"Directory models/tokenizers/{name} already exists - skipping")
+ continue
+
+ print(f"Downloading {name} to models/tokenizers/{name}")
+
+ url = f"{repo}/raw/main/config.json"
+ save_path = f"models/tokenizers/{name}/config.json"
+ download_file_with_auth(url, token, save_path)
+
+ url = f"{repo}/raw/main/tokenizer.json"
+ save_path = f"models/tokenizers/{name}/tokenizer.json"
+ download_file_with_auth(url, token, save_path)
+
+ if tokt == TOKENIZER_TYPE.SPM:
+ url = f"{repo}/resolve/main/tokenizer.model"
+ save_path = f"models/tokenizers/{name}/tokenizer.model"
+ download_file_with_auth(url, token, save_path)
+
+ url = f"{repo}/raw/main/tokenizer_config.json"
+ save_path = f"models/tokenizers/{name}/tokenizer_config.json"
+ download_file_with_auth(url, token, save_path)
+
+# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
+# TODO: auto-update convert-hf-to-gguf.py with the generated function
+
+src_ifs = ""
+for model in models:
+ name = model["name"]
+ tokt = model["tokt"]
+
+ if tokt == TOKENIZER_TYPE.SPM:
+ continue
+
+ # create the tokenizer
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+
+ chktok = tokenizer.encode(chktxt)
+ chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+ print(f"model: {name}")
+ print(f"tokt: {tokt}")
+ print(f"repo: {model['repo']}")
+ print(f"chktok: {chktok}")
+ print(f"chkhsh: {chkhsh}")
+
+ # print the "pre_tokenizer" content from the tokenizer.json
+ with open(f"models/tokenizers/{name}/tokenizer.json", "r") as f:
+ cfg = json.load(f)
+ pre_tokenizer = cfg["pre_tokenizer"]
+ print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+
+ print(f"\n")
+
+ src_ifs += f" if chkhsh == \"{chkhsh}\":\n"
+ src_ifs += f" # ref: {model['repo']}\n"
+ src_ifs += f" res = \"{name}\"\n"
+
+src_func = ""
+src_func += " def get_vocab_base_pre(self, tokenizer) -> str:\n"
+src_func += " # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"
+src_func += " # is specific for the BPE pre-tokenizer used by the model\n"
+src_func += " # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"
+src_func += " # use in llama.cpp to implement the same pre-tokenizer\n"
+src_func += "\n"
+src_func += f" chktxt = {repr(chktxt)}\n"
+src_func += "\n"
+src_func += " chktok = tokenizer.encode(chktxt)\n"
+src_func += " chkhsh = sha256(str(chktok).encode()).hexdigest()\n"
+src_func += "\n"
+src_func += " print(f\"chktok: {chktok}\")\n"
+src_func += " print(f\"chkhsh: {chkhsh}\")\n"
+src_func += "\n"
+src_func += " res = None\n"
+src_func += "\n"
+src_func += " # NOTE: if you get an error here, you need to add the model to the if-elif chain below\n"
+src_func += " # don't do this manually - use the convert-hf-to-gguf-update.py script!\n"
+src_func += f"{src_ifs}\n"
+src_func += " if res is None:\n"
+src_func += " print(\"\\n\")\n"
+src_func += " print(\"**************************************************************************************\")\n"
+src_func += " print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
+src_func += " print(\"** This means that it was not added yet or you are using an older version.\")\n"
+src_func += " print(\"** Check convert-hf-to-gguf-update.py and update it accordingly.\")\n"
+src_func += " print(\"**\")\n"
+src_func += " print(f\"** chkhsh: {chkhsh}\")\n"
+src_func += " print(\"**************************************************************************************\")\n"
+src_func += " print(\"\\n\")\n"
+src_func += " raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"
+src_func += "\n"
+src_func += " print(f\"tokenizer.ggml.pre: {res}\")\n"
+src_func += " print(f\"chkhsh: {chkhsh}\")\n"
+src_func += "\n"
+src_func += " return res\n"
+
+print(src_func)
+
+print("\n")
+print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
+print("\n")
+
+# generate tests for each tokenizer model
+
+tests = [
+ "",
+ " ",
+ " ",
+ " ",
+ "\t",
+ "\n",
+ "\n\n",
+ "\n\n\n",
+ "\t\n",
+ "Hello world",
+ " Hello world",
+ "Hello World",
+ " Hello World",
+ " Hello World!",
+ "Hello, world!",
+ " Hello, world!",
+ " this is 🦙.cpp",
+ "w048 7tuijk dsdfhu",
+ "нещо на Български",
+ "កាន់តែពិសេសអាចខលចេញ",
+ "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
+ "Hello",
+ " Hello",
+ " Hello",
+ " Hello",
+ " Hello",
+ " Hello\n Hello",
+ " (",
+ "\n =",
+ "' era",
+ "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
+ "3",
+ "33",
+ "333",
+ "3333",
+ "33333",
+ "333333",
+ "3333333",
+ "33333333",
+ "333333333",
+ chktxt,
+]
+
+# write the tests to ./models/ggml-vocab-{name}.gguf.inp
+# the format is:
+#
+# test0
+# __ggml_vocab_test__
+# test1
+# __ggml_vocab_test__
+# ...
+#
+
+# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
+# for each test, write the resulting tokens on a separate line
+
+for model in models:
+ name = model["name"]
+ tokt = model["tokt"]
+
+ # create the tokenizer
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+
+ with open(f"models/ggml-vocab-{name}.gguf.inp", "w") as f:
+ for text in tests:
+ f.write(f"{text}")
+ f.write("\n__ggml_vocab_test__\n")
+
+ with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
+ for text in tests:
+ res = tokenizer.encode(text, add_special_tokens=False)
+ for r in res:
+ f.write(f" {r}")
+ f.write("\n")
+
+ print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+
+# generate commands for creating vocab files
+
+print("\nRun the following commands to generate the vocab files for testing:\n")
+
+for model in models:
+ name = model["name"]
+
+ print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+
+print("\n")
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 63710676b..d1b8cef11 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -11,6 +11,7 @@ import sys
from abc import ABC, abstractmethod
from enum import IntEnum
from pathlib import Path
+from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast
import numpy as np
@@ -43,17 +44,18 @@ AnyModel = TypeVar("AnyModel", bound="type[Model]")
class Model(ABC):
_model_classes: dict[str, type[Model]] = {}
- def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
+ def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool):
self.dir_model = dir_model
self.ftype = ftype
self.fname_out = fname_out
self.is_big_endian = is_big_endian
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
+ self.use_temp_file = use_temp_file
self.is_safetensors = self._is_model_safetensors()
self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
self.part_names = self._get_part_names()
self.hparams = Model.load_hparams(self.dir_model)
- self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
+ self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
@property
@@ -228,7 +230,7 @@ class Model(ABC):
return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
# used for GPT-2 BPE and WordPiece vocabs
- def get_basic_vocab(self) -> tuple[list[str], list[int]]:
+ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
tokens: list[str] = []
toktypes: list[int] = []
@@ -237,6 +239,8 @@ class Model(ABC):
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
+ tokpre = self.get_vocab_base_pre(tokenizer)
+
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
added_vocab = tokenizer.get_added_vocab()
@@ -254,11 +258,75 @@ class Model(ABC):
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
- return tokens, toktypes
+ return tokens, toktypes, tokpre
+
+ # NOTE: this function is generated by convert-hf-to-gguf-update.py
+ # do not modify it manually!
+ # ref: https://github.com/ggerganov/llama.cpp/pull/6920
+ def get_vocab_base_pre(self, tokenizer) -> str:
+ # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
+ # is specific for the BPE pre-tokenizer used by the model
+ # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
+ # use in llama.cpp to implement the same pre-tokenizer
+
+ chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+
+ chktok = tokenizer.encode(chktxt)
+ chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+ print(f"chktok: {chktok}")
+ print(f"chkhsh: {chkhsh}")
+
+ res = None
+
+ # NOTE: if you get an error here, you need to add the model to the if-elif chain below
+ # don't do this manually - use the convert-hf-to-gguf-update.py script!
+ if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
+ # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+ res = "llama-bpe"
+ if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
+ # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
+ res = "deepseek-llm"
+ if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
+ # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
+ res = "deepseek-coder"
+ if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
+ # ref: https://huggingface.co/tiiuae/falcon-7b
+ res = "falcon"
+ if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
+ # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
+ res = "bert-bge"
+ if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
+ # ref: https://huggingface.co/mosaicml/mpt-7b
+ res = "mpt"
+ if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
+ # ref: https://huggingface.co/bigcode/starcoder2-3b
+ res = "starcoder"
+ if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
+ # ref: https://huggingface.co/openai-community/gpt2
+ res = "gpt-2"
+
+ if res is None:
+ print("\n")
+ print("**************************************************************************************")
+ print("** WARNING: The BPE pre-tokenizer was not recognized!")
+ print("** This means that it was not added yet or you are using an older version.")
+ print("** Check convert-hf-to-gguf-update.py and update it accordingly.")
+ print("**")
+ print(f"** chkhsh: {chkhsh}")
+ print("**************************************************************************************")
+ print("\n")
+ raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
+
+ print(f"tokenizer.ggml.pre: {res}")
+ print(f"chkhsh: {chkhsh}")
+
+ return res
def _set_vocab_gpt2(self) -> None:
- tokens, toktypes = self.get_basic_vocab()
+ tokens, toktypes, tokpre = self.get_vocab_base()
self.gguf_writer.add_tokenizer_model("gpt2")
+ self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
@@ -276,6 +344,8 @@ class Model(ABC):
vocab_size = hparams["vocab_size"]
assert max(tokenizer.get_vocab().values()) < vocab_size
+ tokpre = self.get_vocab_base_pre(tokenizer)
+
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks
@@ -303,6 +373,7 @@ class Model(ABC):
toktypes.append(gguf.TokenType.NORMAL)
self.gguf_writer.add_tokenizer_model("gpt2")
+ self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
@@ -362,9 +433,20 @@ class Model(ABC):
scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+ if vocab_size > len(tokens):
+ pad_count = vocab_size - len(tokens)
+ print(
+ f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]"
+ )
+ for i in range(1, pad_count + 1):
+ tokens.append(f"[PAD{i}]")
+ scores.append(-1000.0)
+ toktypes.append(SentencePieceTokenTypes.UNUSED)
+
assert len(tokens) == vocab_size
self.gguf_writer.add_tokenizer_model("llama")
+ self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
@@ -386,6 +468,7 @@ class Model(ABC):
assert len(tokens) == vocab.vocab_size
self.gguf_writer.add_tokenizer_model("llama")
+ self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
@@ -829,6 +912,7 @@ class XverseModel(Model):
toktypes.append(toktype)
self.gguf_writer.add_tokenizer_model("llama")
+ self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
@@ -1206,9 +1290,91 @@ class StableLMModel(Model):
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+ self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
+ def write_tensors(self):
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+ n_head = self.hparams.get("num_attention_heads")
+ n_kv_head = self.hparams.get("num_key_value_heads")
+ q_norms = dict()
+ k_norms = dict()
+ for name, data_torch in self.get_tensors():
+ # we don't need these
+ if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+ continue
+
+ old_dtype = data_torch.dtype
+
+ # convert any unsupported data types to float32
+ if data_torch.dtype not in (torch.float16, torch.float32):
+ data_torch = data_torch.to(torch.float32)
+
+ data = data_torch.squeeze().numpy()
+ n_dims = len(data.shape)
+ if name.find("q_layernorm.norms") != -1:
+ q_norms[name] = data
+ if len(q_norms) >= (block_count * n_head):
+ self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
+ continue
+ if name.find("k_layernorm.norms") != -1:
+ k_norms[name] = data
+ if len(k_norms) >= (block_count * n_kv_head):
+ self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
+ continue
+
+ # map tensor names
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+ if new_name is None:
+ print(f"Can not map tensor {name!r}")
+ sys.exit()
+
+ n_dims = len(data.shape)
+ data_dtype = data.dtype
+
+ # if f32 desired, convert any float16 to float32
+ if self.ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+ if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+ data = data.astype(np.float16)
+
+ print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+ self.gguf_writer.add_tensor(new_name, data)
+
+ def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
+ for bid in range(block_count):
+ datas = []
+ for xid in range(n_head):
+ ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
+ datas.append(norms[ename])
+ del norms[ename]
+ data = np.stack(datas, axis=0)
+ data_dtype = data.dtype
+ merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
+ new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+ if new_name is None:
+ print(f"Can not map tensor {name!r}")
+ sys.exit()
+ if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+ data = data.astype(np.float16)
+
+ print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+ self.gguf_writer.add_tensor(new_name, data)
+
@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
class LlamaModel(Model):
@@ -1218,7 +1384,23 @@ class LlamaModel(Model):
try:
self. _set_vocab_sentencepiece()
except FileNotFoundError:
- self._set_vocab_llama_hf()
+ try:
+ self._set_vocab_llama_hf()
+ except (FileNotFoundError, TypeError):
+ # Llama 3
+ self._set_vocab_gpt2()
+
+ # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
+ if self.hparams.get("vocab_size", 32000) == 32016:
+ special_vocab = gguf.SpecialVocab(
+ self.dir_model, load_merges=False,
+ special_token_types = ['prefix', 'suffix', 'middle', 'eot']
+ )
+ special_vocab._set_special_token("prefix", 32007)
+ special_vocab._set_special_token("suffix", 32008)
+ special_vocab._set_special_token("middle", 32009)
+ special_vocab._set_special_token("eot", 32010)
+ special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
super().set_gguf_parameters()
@@ -1226,6 +1408,11 @@ class LlamaModel(Model):
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+ if self.hparams["rope_scaling"].get("type") == "linear":
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
# Same as super class, but permuting q_proj, k_proj
def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@@ -1427,6 +1614,102 @@ class GrokModel(Model):
self.gguf_writer.add_tensor(new_name, data)
+@Model.register("DbrxForCausalLM")
+class DbrxModel(Model):
+ model_arch = gguf.MODEL_ARCH.DBRX
+
+ def set_gguf_parameters(self):
+ ffn_config = self.hparams["ffn_config"]
+ attn_config = self.hparams["attn_config"]
+ self.gguf_writer.add_name(self.hparams["model_type"])
+ self.gguf_writer.add_block_count(self.hparams["n_layers"])
+
+ self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
+ self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+ self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
+
+ self.gguf_writer.add_head_count(self.hparams["n_heads"])
+ self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
+
+ self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
+
+ self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
+ self.gguf_writer.add_file_type(self.ftype)
+
+ self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
+ self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
+
+ self.gguf_writer.add_layer_norm_eps(1e-5)
+
+ self.gguf_writer.add_file_type(self.ftype)
+ print(f"gguf: file type = {self.ftype}")
+
+ def write_tensors(self):
+ block_count = self.hparams.get("n_layers")
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+ for name, data_torch in self.get_tensors():
+ n_expert = self.hparams["ffn_config"]["moe_num_experts"]
+ n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
+ n_embd = self.hparams["d_model"]
+
+ # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
+ # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
+ # But llama.cpp moe graph works differently
+ # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
+ # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
+ exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
+ "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert}
+ "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
+ experts = False
+ for exp_tensor_name in exp_tensor_names.keys():
+ if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
+ experts = True
+ data_torch = data_torch.view(n_expert, n_ff, n_embd)
+ if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
+ data_torch = data_torch.permute(*permute_tensor)
+ break
+
+ old_dtype = data_torch.dtype
+
+ # convert any unsupported data types to float32
+ if data_torch.dtype not in (torch.float16, torch.float32):
+ data_torch = data_torch.to(torch.float32)
+
+ data = data_torch.squeeze().numpy()
+
+ # map tensor names
+ # In MoE models the ffn tensors are typically most of the model weights,
+ # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
+ # Every other model has weight names ending in .weight, so we assume that convention here,
+ # even though dbrx itself does not suffix its expert tensors with .weight:
+ # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
+ new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
+ if new_name is None:
+ print(f"Can not map tensor {name!r}")
+ sys.exit()
+
+ n_dims = len(data.shape)
+ data_dtype = data.dtype
+
+ # Most of the codebase that takes in 1D tensors only handles F32 tensors
+ # and most of the outputs tensors are F32.
+ if data_dtype != np.float32 and n_dims == 1:
+ print(f"Can not map tensor {name!r}: all 1D tensors must be F32")
+ sys.exit()
+
+ # if f32 desired, convert any float16 to float32
+ if self.ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
+ data = data.astype(np.float16)
+
+ print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+ self.gguf_writer.add_tensor(new_name, data)
+
+
@Model.register("MiniCPMForCausalLM")
class MiniCPMModel(Model):
model_arch = gguf.MODEL_ARCH.MINICPM
@@ -1594,6 +1877,111 @@ class QwenModel(Model):
class Qwen2Model(Model):
model_arch = gguf.MODEL_ARCH.QWEN2
+ def set_vocab(self):
+ try:
+ self._set_vocab_sentencepiece()
+ except FileNotFoundError:
+ self._set_vocab_gpt2()
+
+
+@Model.register("Qwen2MoeForCausalLM")
+class Qwen2MoeModel(Model):
+ model_arch = gguf.MODEL_ARCH.QWEN2MOE
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ if (n_experts := self.hparams.get("num_experts")) is not None:
+ self.gguf_writer.add_expert_count(n_experts)
+
+ def write_tensors(self):
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+ n_experts = self.hparams.get("num_experts")
+ experts = dict()
+ for name, data_torch in self.get_tensors():
+ # we don't need these
+ if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+ continue
+
+ old_dtype = data_torch.dtype
+
+ # convert any unsupported data types to float32
+ if data_torch.dtype not in (torch.float16, torch.float32):
+ data_torch = data_torch.to(torch.float32)
+
+ data = data_torch.squeeze().numpy()
+
+ # process the experts separately
+ if name.find("experts") != -1:
+ experts[name] = data
+ if len(experts) >= n_experts * 3:
+ # merge the experts into a single 3d tensor
+ for bid in range(block_count):
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
+ full = True
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+ if ename not in experts:
+ full = False
+ break
+ if not full:
+ continue
+
+ datas = []
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+ datas.append(experts[ename])
+ del experts[ename]
+
+ data = np.stack(datas, axis=0)
+ data_dtype = data.dtype
+
+ if self.ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+ if self.ftype == 1 and data_dtype == np.float32:
+ data = data.astype(np.float16)
+
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+ new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+ if new_name is None:
+ print(f"Can not map tensor {name!r}")
+ sys.exit()
+
+ print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+ self.gguf_writer.add_tensor(new_name, data)
+ continue
+
+ # map tensor names
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+ if new_name is None:
+ print(f"Can not map tensor {name!r}")
+ sys.exit()
+
+ n_dims = len(data.shape)
+ data_dtype = data.dtype
+
+ # if f32 desired, convert any float16 to float32
+ if self.ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+ if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+ data = data.astype(np.float16)
+
+ print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+ self.gguf_writer.add_tensor(new_name, data)
+
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts.keys()}")
+
@Model.register("GPT2LMHeadModel")
class GPT2Model(Model):
@@ -1685,6 +2073,92 @@ class Phi2Model(Model):
self.gguf_writer.add_add_bos_token(False)
+@Model.register("Phi3ForCausalLM")
+class Phi3MiniModel(Model):
+ model_arch = gguf.MODEL_ARCH.PHI3
+
+ def set_vocab(self):
+ from sentencepiece import SentencePieceProcessor
+
+ tokenizer_path = self.dir_model / 'tokenizer.model'
+
+ if not tokenizer_path.is_file():
+ print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
+ sys.exit(1)
+
+ tokenizer = SentencePieceProcessor(str(tokenizer_path))
+
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+ scores: list[float] = [-10000.0] * vocab_size
+ toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+ for token_id in range(tokenizer.vocab_size()):
+
+ piece = tokenizer.id_to_piece(token_id)
+ text = piece.encode("utf-8")
+ score = tokenizer.get_score(token_id)
+
+ toktype = SentencePieceTokenTypes.NORMAL
+ if tokenizer.is_unknown(token_id):
+ toktype = SentencePieceTokenTypes.UNKNOWN
+ elif tokenizer.is_control(token_id):
+ toktype = SentencePieceTokenTypes.CONTROL
+ elif tokenizer.is_unused(token_id):
+ toktype = SentencePieceTokenTypes.UNUSED
+ elif tokenizer.is_byte(token_id):
+ toktype = SentencePieceTokenTypes.BYTE
+
+ tokens[token_id] = text
+ scores[token_id] = score
+ toktypes[token_id] = toktype
+
+ added_tokens_file = self.dir_model / 'added_tokens.json'
+ if added_tokens_file.is_file():
+ with open(added_tokens_file, "r", encoding="utf-8") as f:
+ added_tokens_json = json.load(f)
+
+ for key in added_tokens_json:
+ token_id = added_tokens_json[key]
+ if (token_id >= vocab_size):
+ print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+ continue
+
+ tokens[token_id] = key.encode("utf-8")
+ scores[token_id] = -1000.0
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+ self.gguf_writer.add_tokenizer_model("llama")
+ self.gguf_writer.add_tokenizer_pre("default")
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_scores(scores)
+ self.gguf_writer.add_token_types(toktypes)
+
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+ special_vocab.add_to_gguf(self.gguf_writer)
+
+ def set_gguf_parameters(self):
+ block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
+
+ rot_pct = 1.0
+ n_embd = self.find_hparam(["hidden_size", "n_embd"])
+ n_head = self.find_hparam(["num_attention_heads", "n_head"])
+ rms_eps = self.find_hparam(["rms_norm_eps"])
+
+ self.gguf_writer.add_name("Phi3")
+ self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
+
+ self.gguf_writer.add_embedding_length(n_embd)
+ self.gguf_writer.add_feed_forward_length(8192)
+ self.gguf_writer.add_block_count(block_count)
+ self.gguf_writer.add_head_count(n_head)
+ self.gguf_writer.add_head_count_kv(n_head)
+ self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
+ self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+ self.gguf_writer.add_file_type(self.ftype)
+
+
@Model.register("PlamoForCausalLM")
class PlamoModel(Model):
model_arch = gguf.MODEL_ARCH.PLAMO
@@ -1899,6 +2373,7 @@ class InternLM2Model(Model):
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
self.gguf_writer.add_tokenizer_model("llama")
+ self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
@@ -1908,6 +2383,8 @@ class InternLM2Model(Model):
old_eos = special_vocab.special_token_ids["eos"]
if "chat" in os.path.basename(self.dir_model.absolute()):
# For the chat model, we replace the eos with '<|im_end|>'.
+ # TODO: this is a hack, should be fixed
+ # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
in chat mode so that the conversation can end normally.")
@@ -2046,7 +2523,7 @@ class BertModel(Model):
self.gguf_writer.add_pooling_type(pooling_type)
def set_vocab(self):
- tokens, toktypes = self.get_basic_vocab()
+ tokens, toktypes, tokpre = self.get_vocab_base()
self.vocab_size = len(tokens)
# we need this to validate the size of the token_type embeddings
@@ -2064,6 +2541,7 @@ class BertModel(Model):
# add vocab to gguf
self.gguf_writer.add_tokenizer_model("bert")
+ self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
@@ -2085,6 +2563,10 @@ class BertModel(Model):
print(f"Can not map tensor {name!r}")
sys.exit()
+ # convert any unsupported data types to float32
+ if data_torch.dtype not in (torch.float16, torch.float32):
+ data_torch = data_torch.to(torch.float32)
+
data = data_torch.squeeze().numpy()
n_dims = len(data.shape)
new_dtype: type[np.floating[Any]]
@@ -2144,6 +2626,16 @@ class GemmaModel(Model):
def set_vocab(self):
self._set_vocab_sentencepiece()
+ # TODO: these special tokens should be exported only for the CodeGemma family
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
+ special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
+ special_vocab._set_special_token("prefix", 67)
+ special_vocab._set_special_token("suffix", 69)
+ special_vocab._set_special_token("middle", 68)
+ special_vocab._set_special_token("fsep", 70)
+ special_vocab._set_special_token("eot", 107)
+ special_vocab.add_to_gguf(self.gguf_writer)
+
def set_gguf_parameters(self):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
@@ -2165,6 +2657,12 @@ class GemmaModel(Model):
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
for name, data_torch in self.get_tensors():
+ # lm_head is not used in llama.cpp, but autoawq will include this tensor in the model
+ # To prevent errors, skip loading lm_head.weight.
+ if name == "lm_head.weight":
+ print(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
+ continue
+
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
@@ -2224,28 +2722,37 @@ class MambaModel(Model):
field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
+
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
+ self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]))
+
field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
+
field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+
field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
+
field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
+
field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
+
field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
def set_gguf_parameters(self):
- d_model = self.find_hparam(["hidden_size", "d_model"])
- d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
+ d_model = self.find_hparam(["hidden_size", "d_model"])
+ d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
- d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
+ d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
# ceiling division
# ref: https://stackoverflow.com/a/17511341/22827863
# ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
- dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
+ dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
# Fail early for models which don't have a block expansion factor of 2
@@ -2337,6 +2844,66 @@ class CommandR2Model(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+@Model.register("OlmoForCausalLM")
+@Model.register("OLMoForCausalLM")
+class OlmoModel(Model):
+ model_arch = gguf.MODEL_ARCH.OLMO
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ self.gguf_writer.add_layer_norm_eps(1e-5)
+ if "clip_qkv" in self.hparams is not None:
+ self.gguf_writer.add_clamp_kqv(self.hparams["clip_qkv"])
+
+ # Same as super class, but permuting q_proj, k_proj
+ # Copied from: LlamaModel
+ def write_tensors(self):
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+ n_head = self.hparams.get("num_attention_heads")
+ n_kv_head = self.hparams.get("num_key_value_heads")
+ for name, data_torch in self.get_tensors():
+ old_dtype = data_torch.dtype
+
+ # convert any unsupported data types to float32
+ if data_torch.dtype not in (torch.float16, torch.float32):
+ data_torch = data_torch.to(torch.float32)
+
+ data = data_torch.numpy()
+
+ if name.endswith("q_proj.weight"):
+ data = permute(data, n_head, n_head)
+ if name.endswith("k_proj.weight"):
+ data = permute(data, n_head, n_kv_head)
+
+ data = data.squeeze()
+
+ # map tensor names
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+ if new_name is None:
+ print(f"Can not map tensor {name!r}")
+ sys.exit()
+
+ n_dims = len(data.shape)
+ data_dtype = data.dtype
+
+ # if f32 desired, convert any float16 to float32
+ if self.ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+ # 1d tensors need to be converted to float32
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2:
+ data = data.astype(np.float16)
+
+ print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+ self.gguf_writer.add_tensor(new_name, data)
+
+
###### CONVERSION LOGIC ######
@@ -2363,6 +2930,8 @@ def parse_args() -> argparse.Namespace:
"model", type=Path,
help="directory containing model file",
)
+ parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
+ parser.add_argument("--model-name", type=str, default=None, help="name of the model")
return parser.parse_args()
@@ -2406,7 +2975,7 @@ def main() -> None:
with torch.inference_mode():
model_class = Model.from_model_architecture(hparams["architectures"][0])
- model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
+ model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)
print("Set model parameters")
model_instance.set_gguf_parameters()
diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py
index cd9644fcb..5354b748b 100755
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@@ -281,6 +281,7 @@ class GGMLToGGUF:
def add_vocab(self, gguf_writer):
hp = self.model.hyperparameters
gguf_writer.add_tokenizer_model('llama')
+ gguf_writer.add_tokenizer_pre('default')
tokens = []
scores = []
toktypes = []
diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py
index 69be17f94..aba575426 100755
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -99,6 +99,7 @@ def main():
tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
gguf_writer.add_tokenizer_model('llama')
+ gguf_writer.add_tokenizer_pre('default')
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
diff --git a/convert.py b/convert.py
index e860ac89f..1c700cf6a 100755
--- a/convert.py
+++ b/convert.py
@@ -525,7 +525,14 @@ class LlamaHfVocab(Vocab):
# pre-check so we know if we need transformers
tokenizer_model: dict[str, Any] = tokenizer_json['model']
- if (
+ is_llama3 = (
+ tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+ and not tokenizer_model.get('byte_fallback', True)
+ )
+ if is_llama3:
+ raise TypeError('Llama 3 must be converted with BpeVocab')
+
+ if not is_llama3 and (
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'Sequence'
):
@@ -1350,7 +1357,7 @@ def load_some_model(path: Path) -> ModelPlus:
# Be extra-friendly and accept either a file or a directory:
if path.is_dir():
# Check if it's a set of safetensors files first
- globs = ["model-00001-of-*.safetensors", "model.safetensors"]
+ globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
files = [file for glob in globs for file in path.glob(glob)]
if not files:
# Try the PyTorch patterns too, with lower priority
diff --git a/docs/HOWTO-add-model.md b/docs/HOWTO-add-model.md
new file mode 100644
index 000000000..a56b78344
--- /dev/null
+++ b/docs/HOWTO-add-model.md
@@ -0,0 +1,119 @@
+## Add a new model architecture to `llama.cpp`
+
+Adding a model requires a few steps:
+
+1. Convert the model to GGUF
+2. Define the model architecture in `llama.cpp`
+3. Build the GGML graph implementation
+
+After following these steps, you can open a PR.
+
+Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
+- [main](../examples/main)
+- [imatrix](../examples/imatrix)
+- [quantize](../examples/quantize)
+- [server](../examples/server)
+
+### 1. Convert the model to GGUF
+
+This step is done in Python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
+Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
+
+The convert script reads the model configuration, tokenizer, and tensor names and data, and converts them to GGUF metadata and tensors.
+
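+To give an idea of the overall flow, the conversion inside [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) boils down to roughly the following (a simplified sketch with abbreviated variable names, not the exact code):
+
+```python
+hparams     = Model.load_hparams(dir_model)  # reads config.json
+model_class = Model.from_model_architecture(hparams["architectures"][0])
+model       = model_class(dir_model, ftype, fname_out, is_big_endian, use_temp_file)
+
+model.set_gguf_parameters()  # GGUF metadata: context length, head counts, ...
+model.set_vocab()            # tokenizer -> GGUF vocab
+model.write()                # tensors + metadata -> the output .gguf file
+```
+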
+For an HF model, the required steps to implement are:
+
+1. Register the model with the `Model.register` decorator on a new `Model` subclass, for example:
+
+```python
+@Model.register("MyModelForCausalLM")
+class MyModel(Model):
+ model_arch = gguf.MODEL_ARCH.GROK
+```
+
+2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)
+
+Add an enum entry in `MODEL_ARCH`, the model's human-friendly name in `MODEL_ARCH_NAMES`, and the GGUF tensor names in `MODEL_TENSORS`.
+
+Example for the `falcon` model:
+```python
+ MODEL_ARCH.FALCON: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_NORM_2,
+ MODEL_TENSOR.ATTN_QKV,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ ]
+```
+
+3. Map the original tensor names to the standardized equivalents in GGUF
+
+As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
+
+Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.
+
+If the tensor name is part of a repeated layer/block, the keyword `bid` is substituted for the block index.
+
+Example for the normalization tensor in attention layers:
+
+```python
+block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
+ # Attention norm
+ MODEL_TENSOR.ATTN_NORM: (
+ "gpt_neox.layers.{bid}.input_layernorm", # gptneox
+ "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen
+ "transformer.blocks.{bid}.norm_1", # mpt
+ ...
+ )
+}
+```
+
+`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.
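+
+If the `gguf` Python package is installed, you can sanity-check a mapping directly. The snippet below is only an illustration; the `LLAMA` architecture and block count are placeholders:
+
+```python
+import gguf
+
+# resolve an HF tensor name for a given architecture and block count
+tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, 32)
+name = tensor_map.get_name("model.layers.0.input_layernorm.weight", try_suffixes=(".weight", ".bias"))
+print(name)  # expected to print: blk.0.attn_norm.weight
+```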
+
+Depending on the model configuration, tokenizer, code and tensor layout, you will have to override some of the following (a minimal sketch follows this list):
+- `Model#set_gguf_parameters`
+- `Model#set_vocab`
+- `Model#write_tensors`
+
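+As a minimal sketch (the architecture name and hparam key below are illustrative, not taken from a real model), such overrides might look like:
+
+```python
+@Model.register("MyModelForCausalLM")  # hypothetical HF architecture name
+class MyModel(Model):
+    model_arch = gguf.MODEL_ARCH.LLAMA  # placeholder; use the enum entry added in constants.py
+
+    def set_vocab(self):
+        # prefer the sentencepiece tokenizer, fall back to BPE if tokenizer.model is absent
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        # export metadata the base implementation does not cover (the hparam key is an assumption)
+        if (rope_theta := self.hparams.get("rope_theta")) is not None:
+            self.gguf_writer.add_rope_freq_base(rope_theta)
+```
+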
+NOTE: Tensor names must end with the `.weight` suffix; that is the convention, and several tools such as `quantize` rely on it to locate the weights.
+
+### 2. Define the model architecture in `llama.cpp`
+
+The model params and tensor layout must be defined in `llama.cpp`:
+1. Define a new `llm_arch`
+2. Define the tensors layout in `LLM_TENSOR_NAMES`
+3. Add any non-standard metadata in `llm_load_hparams`
+4. Create the tensors for inference in `llm_load_tensors`
+5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
+
+NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
+
+### 3. Build the GGML graph implementation
+
+This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
+
+Have a look at existing implementations such as `build_llama`, `build_dbrx` or `build_bert`.
+
+When implementing a new graph, please note that the underlying `ggml` backends might not support all of the required operations; support for missing backend operations can be added in another PR.
+
+Note: to debug the inference graph, you can use [eval-callback](../examples/eval-callback).
+
+## GGUF specification
+
+https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
+
+## Resources
+
+- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
+- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
+- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
+- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
+- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
+- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
+- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
+- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
+- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 76496bf06..f421769cc 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -19,6 +19,7 @@ else()
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
+ add_subdirectory(eval-callback)
add_subdirectory(finetune)
add_subdirectory(gritlm)
add_subdirectory(gguf-split)
diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
index d75c503d5..dbbd06da5 100644
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -153,7 +153,7 @@ while n_cur <= n_len {
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream? -> mark the stream as finished
- if new_token_id == llama_token_eos(model) || n_cur == n_len {
+ if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
i_batch[i] = -1
// print("")
if n_parallel > 1 {
@@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
var result = [CChar](repeating: 0, count: 8)
- let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
+ let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
if nTokens < 0 {
let actualTokensCount = -Int(nTokens)
result = .init(repeating: 0, count: actualTokensCount)
@@ -237,7 +237,8 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
model,
token,
&result,
- Int32(result.count)
+ Int32(result.count),
+ false
)
assert(check == actualTokensCount)
} else {
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 7aaf63ceb..be30d20bf 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -191,8 +191,8 @@ int main(int argc, char ** argv) {
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
- // is it an end of stream? -> mark the stream as finished
- if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+ // is it an end of generation? -> mark the stream as finished
+ if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
i_batch[i] = -1;
LOG_TEE("\n");
if (n_parallel > 1) {
diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp
index 866c6d7a6..3d34378a5 100644
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@@ -47,7 +47,7 @@ struct beam_search_callback_data {
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
- return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
+ return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
}
// Function matching type llama_beam_search_callback_fn_t.
diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt
new file mode 100644
index 000000000..c56ba780b
--- /dev/null
+++ b/examples/eval-callback/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(TARGET eval-callback)
+add_executable(${TARGET} eval-callback.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+set(TEST_TARGET test-eval-callback)
+add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
diff --git a/examples/eval-callback/README.md b/examples/eval-callback/README.md
new file mode 100644
index 000000000..66a37e878
--- /dev/null
+++ b/examples/eval-callback/README.md
@@ -0,0 +1,95 @@
+# llama.cpp/examples/eval-callback
+
+A simple example which demonstrates how to use a callback during inference.
+It prints all operations and tensor data to the console.
+
+Usage:
+
+```shell
+eval-callback \
+ --hf-repo ggml-org/models \
+ --hf-file phi-2/ggml-model-q4_0.gguf \
+ --model phi-2-q4_0.gguf \
+ --prompt hello \
+ --seed 42 \
+ -ngl 33
+```
+
+Will print:
+
+```shell
+llm_load_tensors: offloaded 33/33 layers to GPU
+...
+llama_new_context_with_model: n_ctx = 512
+...
+llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB
+llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB
+llama_new_context_with_model: graph nodes = 1225
+llama_new_context_with_model: graph splits = 2
+ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
+ [
+ [
+ [ -0.0181, 0.0272, 0.0272, ...],
+ ],
+ ]
+ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+ [
+ [
+ [ -0.6989, 1.0636, 1.0636, ...],
+ ],
+ ]
+ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+ [
+ [
+ [ -0.1800, 0.2817, 0.2632, ...],
+ ],
+ ]
+ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+ [
+ [
+ [ -0.1863, 0.2970, 0.2604, ...],
+ ],
+ ]
+ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
+ [
+ [
+ [ -1.1238, 1.2876, -1.8086, ...],
+ ],
+ ]
+ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
+ [
+ [
+ [ -1.1135, 1.4604, -1.9226, ...],
+ ],
+ ]
+ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
+ [
+ [
+ [ -1.1135, 1.4604, -1.9226, ...],
+ ],
+ ]
+ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+ [
+ [
+ [ -1.1135, 1.4604, -1.9226, ...],
+ ],
+ ]
+ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
+ [
+ [
+ [ -1.1135, 1.4604, -1.9226, ...],
+ [ -0.3608, 0.5076, -1.8866, ...],
+ [ 1.7643, 0.0273, -2.1065, ...],
+ ...
+ ],
+ ]
+ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
+ [
+ [
+ [ -1.1135, 1.4604, -1.9226, ...],
+ [ -0.3608, 0.5076, -1.8866, ...],
+ [ 1.7643, 0.0273, -2.1065, ...],
+ ...
+ ],
+ ]
+```
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
new file mode 100644
index 000000000..29b5f3b3c
--- /dev/null
+++ b/examples/eval-callback/eval-callback.cpp
@@ -0,0 +1,195 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <cstdio>
+#include <random>
+#include <string>
+#include <tuple>
+#include <vector>
+
+/**
+ * This is the arbitrary data which will be passed to each callback.
+ * Later on we can, for example, add an operation or tensor name filter from a CLI arg, or a file descriptor to dump the tensor.
+ */
+struct callback_data {
+ std::vector<uint8_t> data;
+};
+
+static std::string ggml_ne_string(const ggml_tensor * t) {
+ std::string str;
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+ str += std::to_string(t->ne[i]);
+ if (i + 1 < GGML_MAX_DIMS) {
+ str += ", ";
+ }
+ }
+ return str;
+}
+
+static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
+ GGML_ASSERT(n > 0);
+ float sum = 0;
+ for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+ printf(" [\n");
+ for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+ if (i2 == n && ne[2] > 2*n) {
+ printf(" ..., \n");
+ i2 = ne[2] - n;
+ }
+ printf(" [\n");
+ for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+ if (i1 == n && ne[1] > 2*n) {
+ printf(" ..., \n");
+ i1 = ne[1] - n;
+ }
+ printf(" [");
+ for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+ if (i0 == n && ne[0] > 2*n) {
+ printf("..., ");
+ i0 = ne[0] - n;
+ }
+ size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
+ float v;
+ if (type == GGML_TYPE_F16) {
+ v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+ } else if (type == GGML_TYPE_F32) {
+ v = *(float *) &data[i];
+ } else if (type == GGML_TYPE_I32) {
+ v = (float) *(int32_t *) &data[i];
+ } else if (type == GGML_TYPE_I16) {
+ v = (float) *(int16_t *) &data[i];
+ } else if (type == GGML_TYPE_I8) {
+ v = (float) *(int8_t *) &data[i];
+ } else {
+ GGML_ASSERT(false);
+ }
+ printf("%12.4f", v);
+ sum += v;
+ if (i0 < ne[0] - 1) printf(", ");
+ }
+ printf("],\n");
+ }
+ printf(" ],\n");
+ }
+ printf(" ]\n");
+ printf(" sum = %f\n", sum);
+ }
+}
+
+/**
+ * GGML operations callback during the graph execution.
+ *
+ * @param t current tensor
+ * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
+ * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
+ * see ggml_backend_sched_eval_callback
+ * @param user_data user data to pass at each call back
+ * @return true to receive data or continue the graph, false otherwise
+ */
+static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
+ auto * cb_data = (callback_data *) user_data;
+
+ const struct ggml_tensor * src0 = t->src[0];
+ const struct ggml_tensor * src1 = t->src[1];
+
+ if (ask) {
+ return true; // Always retrieve data
+ }
+
+ char src1_str[128] = {0};
+ if (src1) {
+ sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
+ }
+
+ printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+ t->name, ggml_type_name(t->type), ggml_op_desc(t),
+ src0->name, ggml_ne_string(src0).c_str(),
+ src1 ? src1_str : "",
+ ggml_ne_string(t).c_str());
+
+
+ // copy the data from the GPU memory if needed
+ const bool is_host = ggml_backend_buffer_is_host(t->buffer);
+
+ if (!is_host) {
+ auto n_bytes = ggml_nbytes(t);
+ cb_data->data.resize(n_bytes);
+ ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
+ }
+
+ if (!ggml_is_quantized(t->type)) {
+ uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+ ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+ }
+
+ return true;
+}
+
+static bool run(llama_context * ctx, const gpt_params & params) {
+ const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+
+ std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+
+ if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+ fprintf(stderr, "%s : failed to eval\n", __func__);
+ return false;
+ }
+
+ return true;
+}
+
+int main(int argc, char ** argv) {
+
+ callback_data cb_data;
+
+ gpt_params params;
+ if (!gpt_params_parse(argc, argv, params)) {
+ return 1;
+ }
+
+ print_build_info();
+
+ std::mt19937 rng(params.seed);
+ if (params.random_prompt) {
+ params.prompt = gpt_random_prompt(rng);
+ }
+
+ llama_backend_init();
+ llama_numa_init(params.numa);
+
+ // pass the callback to the backend scheduler
+ // it will be executed for each node during the graph computation
+ params.cb_eval = ggml_debug;
+ params.cb_eval_user_data = &cb_data;
+ params.warmup = false;
+
+ // init
+ llama_model * model;
+ llama_context * ctx;
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ if (model == nullptr || ctx == nullptr) {
+ fprintf(stderr, "%s : failed to init\n", __func__);
+ return 1;
+ }
+
+ // print system information
+ {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "%s\n", get_system_info(params).c_str());
+ }
+
+ bool OK = run(ctx, params);
+ if (!OK) {
+ return 1;
+ }
+
+ llama_print_timings(ctx);
+
+ llama_free(ctx);
+ llama_free_model(model);
+
+ llama_backend_free();
+
+ return 0;
+}
diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp
index e4c0c1689..091069ffa 100644
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -17,7 +17,7 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks;
- grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+ llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
if (grammar->stacks.empty()) {
error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
diff --git a/examples/gguf-split/README.md b/examples/gguf-split/README.md
index ddb1f7649..ad1d86651 100644
--- a/examples/gguf-split/README.md
+++ b/examples/gguf-split/README.md
@@ -5,5 +5,6 @@ CLI to split / merge GGUF files.
**Command line options:**
- `--split`: split GGUF to multiple GGUF, default operation.
+- `--split-max-size`: max size per split in `M` or `G`, e.g. `500M` or `2G`.
- `--split-max-tensors`: maximum tensors in each split: default(128)
- `--merge`: merge multiple GGUF to a single GGUF.
diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp
index 24acbf02a..39c75e0a7 100644
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -59,10 +59,10 @@ static size_t split_str_to_n_bytes(std::string str) {
int n;
if (str.back() == 'M') {
sscanf(str.c_str(), "%d", &n);
- n_bytes = n * 1024 * 1024; // megabytes
+ n_bytes = (size_t)n * 1024 * 1024; // megabytes
} else if (str.back() == 'G') {
sscanf(str.c_str(), "%d", &n);
- n_bytes = n * 1024 * 1024 * 1024; // gigabytes
+ n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
} else {
throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
}
diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh
new file mode 100755
index 000000000..57588204d
--- /dev/null
+++ b/examples/gguf-split/tests.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+set -eu
+
+if [ $# -lt 1 ]
+then
+ echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
+ echo "example: $0 ../../build/bin ../../tmp"
+ exit 1
+fi
+
+if [ $# -gt 1 ]
+then
+ TMP_DIR=$2
+else
+ TMP_DIR=/tmp
+fi
+
+set -x
+
+SPLIT=$1/gguf-split
+MAIN=$1/main
+WORK_PATH=$TMP_DIR/gguf-split
+ROOT_DIR=$(realpath $(dirname $0)/../../)
+
+mkdir -p "$WORK_PATH"
+
+# Clean up in case of previously failed test
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
+
+# 1. Get a model
+(
+cd $WORK_PATH
+"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
+)
+echo PASS
+
+# 2. Split with max tensors strategy
+$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
+echo PASS
+echo
+
+# 2b. Test the sharded model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 3. Merge
+$SPLIT --merge $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-merge.gguf
+echo PASS
+echo
+
+# 3b. Test the merged model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 4. Split with no tensor in metadata
+#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
+#echo PASS
+#echo
+
+# 4b. Test the sharded model is loading properly
+#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32
+#echo PASS
+#echo
+
+# 5. Merge
+#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
+#echo PASS
+#echo
+
+# 5b. Test the merged model is loading properly
+#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
+#echo PASS
+#echo
+
+# 6. Split with size strategy
+$SPLIT --split-max-size 2G $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-2G
+echo PASS
+echo
+
+# 6b. Test the sharded model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# Clean up
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index 5444503a5..575143771 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -142,7 +142,7 @@ static bool gguf_ex_read_0(const std::string & fname) {
}
// read and create ggml_context containing the tensors and their data
-static bool gguf_ex_read_1(const std::string & fname) {
+static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
@@ -206,7 +206,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
printf("\n\n");
// check data
- {
+ if (check_data) {
const float * data = (const float *) cur->data;
for (int j = 0; j < ggml_nelements(cur); ++j) {
if (data[j] != 100 + i) {
@@ -229,9 +229,16 @@ static bool gguf_ex_read_1(const std::string & fname) {
int main(int argc, char ** argv) {
if (argc < 3) {
- printf("usage: %s data.gguf r|w\n", argv[0]);
+ printf("usage: %s data.gguf r|w [n]\n", argv[0]);
+ printf("r: read data.gguf file\n");
+ printf("w: write data.gguf file\n");
+ printf("n: no check of tensor data\n");
return -1;
}
+ bool check_data = true;
+ if (argc == 4) {
+ check_data = false;
+ }
const std::string fname(argv[1]);
const std::string mode (argv[2]);
@@ -242,7 +249,7 @@ int main(int argc, char ** argv) {
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
} else if (mode == "r") {
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
- GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
+ GGML_ASSERT(gguf_ex_read_1(fname, check_data) && "failed to read gguf file");
}
return 0;
diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md
index 64cc19204..a3a3c1389 100644
--- a/examples/gritlm/README.md
+++ b/examples/gritlm/README.md
@@ -21,12 +21,12 @@ not have to be performed at all.
### Running the example
Download a Grit model:
```console
-$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf
+$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
```
Run the example using the downloaded model:
```console
-$ ./gritlm -m gritlm-7b_q4_1.gguf
+$ ./gritlm -m models/gritlm-7b_q4_1.gguf
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 1bf55f90c..71e7a727f 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -23,6 +23,7 @@ struct Stats {
};
struct StatParams {
+ std::string dataset;
std::string ofile = "imatrix.dat";
int n_output_frequency = 10;
int verbosity = 1;
@@ -44,9 +45,9 @@ private:
std::mutex m_mutex;
int m_last_call = 0;
std::vector<float> m_src1_data;
- std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id
+ std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
//
- void save_imatrix(const char * file_name) const;
+ void save_imatrix(const char * file_name, const char * dataset) const;
void keep_imatrix(int ncall) const;
};
@@ -81,6 +82,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (ask) {
if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
if (t->op != GGML_OP_MUL_MAT) return false;
+ // why are small batches ignored (<16 tokens)?
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
return true;
@@ -101,16 +103,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
// this has been adapted to the new format of storing merged experts in a single 3d tensor
// ref: https://github.com/ggerganov/llama.cpp/pull/6387
if (t->op == GGML_OP_MUL_MAT_ID) {
- const int idx = ((int32_t *) t->op_params)[0];
+ // ids -> [n_experts_used, n_tokens]
+ // src1 -> [cols, n_expert_used, n_tokens]
const ggml_tensor * ids = t->src[2];
const int n_as = src0->ne[2];
+ const int n_ids = ids->ne[0];
// the top-k selected expert ids are stored in the ids tensor
// for simplicity, always copy ids to host, because it is small
// take into account that ids is not contiguous!
- GGML_ASSERT(ids->ne[1] == src1->ne[1]);
- GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int)));
- m_ids.resize(ggml_nbytes(ids)/sizeof(int));
+
+ GGML_ASSERT(ids->ne[1] == src1->ne[2]);
+
+ m_ids.resize(ggml_nbytes(ids));
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
auto & e = m_stats[wname];
@@ -120,26 +125,35 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
// using the following line, we can correct for that if needed by replacing the line above with:
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
+ if (e.values.empty()) {
+ e.values.resize(src1->ne[0]*n_as, 0);
+ }
+ else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
+ fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
+ exit(1); //GGML_ASSERT(false);
+ }
+ if (m_params.verbosity > 1) {
+ printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
+ }
// loop over all possible experts, regardless if they are used or not in the batch
for (int ex = 0; ex < n_as; ++ex) {
size_t e_start = ex*src1->ne[0];
- if (e.values.empty()) {
- e.values.resize(src1->ne[0]*n_as, 0);
- }
- else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
- fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
- exit(1); //GGML_ASSERT(false);
- }
- if (m_params.verbosity > 1) {
- printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
- }
- for (int row = 0; row < (int)src1->ne[1]; ++row) {
- const int excur = m_ids[row*n_as + idx];
- GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
- if (excur != ex) continue;
- const float * x = data + row * src1->ne[0];
- for (int j = 0; j < (int)src1->ne[0]; ++j) {
- e.values[e_start + j] += x[j]*x[j];
+
+ for (int idx = 0; idx < n_ids; ++idx) {
+ for (int row = 0; row < (int)src1->ne[2]; ++row) {
+ const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);
+
+ GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
+
+ if (excur != ex) continue;
+
+ const int64_t i11 = idx % src1->ne[1];
+ const int64_t i12 = row;
+ const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
+
+ for (int j = 0; j < (int)src1->ne[0]; ++j) {
+ e.values[e_start + j] += x[j]*x[j];
+ }
}
}
if (e.ncall > m_last_call) {
@@ -186,7 +200,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
void IMatrixCollector::save_imatrix() const {
- save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
+ save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
}
void IMatrixCollector::keep_imatrix(int ncall) const {
@@ -194,24 +208,33 @@ void IMatrixCollector::keep_imatrix(int ncall) const {
if (file_name.empty()) file_name = "imatrix.dat";
file_name += ".at_";
file_name += std::to_string(ncall);
- save_imatrix(file_name.c_str());
+ save_imatrix(file_name.c_str(), m_params.dataset.c_str());
}
-void IMatrixCollector::save_imatrix(const char * fname) const {
+void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size();
- out.write((const char*)&n_entries, sizeof(n_entries));
- for (auto& p : m_stats) {
+ out.write((const char *) &n_entries, sizeof(n_entries));
+ for (const auto & p : m_stats) {
int len = p.first.size();
- out.write((const char*)&len, sizeof(len));
+ out.write((const char *) &len, sizeof(len));
out.write(p.first.c_str(), len);
- out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
+ out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
int nval = p.second.values.size();
- out.write((const char*)&nval, sizeof(nval));
- if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
+ out.write((const char *) &nval, sizeof(nval));
+ if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
}
+
+ // Write the number of calls the matrix was computed with
+ out.write((const char *) &m_last_call, sizeof(m_last_call));
+
+ // Write the dataset name at the end of the file so it can later be referenced by quantize
+ int n_dataset = strlen(dataset);
+ out.write((const char *) &n_dataset, sizeof(n_dataset));
+ out.write(dataset, n_dataset);
+
if (m_params.verbosity > 0) {
- fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
+ fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
}
}
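For reference, a minimal sketch of a reader for the extended file layout that `save_imatrix` writes above: per-entry name/ncall/values, now followed by the global call count and the dataset name. Only the on-disk field order is taken from this diff; the struct and function names below are hypothetical.

```cpp
// Hypothetical reader mirroring the layout save_imatrix writes:
// [int n_entries], then per entry: [int len][name bytes][int ncall][int nval][nval floats],
// followed by the trailing fields added in this change: [int last_call][int n_dataset][dataset bytes].
#include <fstream>
#include <map>
#include <string>
#include <vector>

struct imatrix_entry { int ncall = 0; std::vector<float> values; };

static bool read_imatrix(const char * fname, std::map<std::string, imatrix_entry> & out,
                         int & last_call, std::string & dataset) {
    std::ifstream in(fname, std::ios::binary);
    if (!in) return false;
    int n_entries = 0;
    in.read((char *) &n_entries, sizeof(n_entries));
    for (int i = 0; i < n_entries; ++i) {
        int len = 0;
        in.read((char *) &len, sizeof(len));
        std::string name(len, '\0');
        in.read(&name[0], len);
        imatrix_entry e;
        in.read((char *) &e.ncall, sizeof(e.ncall));
        int nval = 0;
        in.read((char *) &nval, sizeof(nval));
        e.values.resize(nval);
        if (nval > 0) in.read((char *) e.values.data(), nval * sizeof(float));
        out[name] = std::move(e);
    }
    // trailing fields introduced by this patch
    in.read((char *) &last_call, sizeof(last_call));
    int n_dataset = 0;
    in.read((char *) &n_dataset, sizeof(n_dataset));
    dataset.resize(n_dataset);
    if (n_dataset > 0) in.read(&dataset[0], n_dataset);
    return bool(in);
}
```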
@@ -534,6 +557,29 @@ int main(int argc, char ** argv) {
}
}
+ gpt_params params;
+ params.n_batch = 512;
+ if (!gpt_params_parse(args.size(), args.data(), params)) {
+ return 1;
+ }
+
+ params.logits_all = true;
+ params.n_batch = std::min(params.n_batch, params.n_ctx);
+
+ print_build_info();
+
+ if (params.seed == LLAMA_DEFAULT_SEED) {
+ params.seed = time(NULL);
+ }
+
+ fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
+
+ std::mt19937 rng(params.seed);
+ if (params.random_prompt) {
+ params.prompt = gpt_random_prompt(rng);
+ }
+
+ sparams.dataset = params.prompt_file;
g_collector.set_parameters(std::move(sparams));
if (!combine_files.empty()) {
@@ -572,49 +618,21 @@ int main(int argc, char ** argv) {
}
}
- gpt_params params;
- params.n_batch = 512;
- if (!gpt_params_parse(args.size(), args.data(), params)) {
- return 1;
- }
-
- params.logits_all = true;
- params.n_batch = std::min(params.n_batch, params.n_ctx);
-
- print_build_info();
-
- if (params.seed == LLAMA_DEFAULT_SEED) {
- params.seed = time(NULL);
- }
-
- fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
- std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
- }
-
llama_backend_init();
llama_numa_init(params.numa);
- llama_model_params mparams = llama_model_params_from_gpt_params(params);
-
- llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
- if (model == NULL) {
- fprintf(stderr, "%s: error: unable to load model\n", __func__);
- return 1;
- }
-
- llama_context_params cparams = llama_context_params_from_gpt_params(params);
-
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
- cparams.cb_eval = ik_collect_imatrix;
- cparams.cb_eval_user_data = NULL;
+ params.cb_eval = ik_collect_imatrix;
+ params.cb_eval_user_data = NULL;
+ params.warmup = false;
- llama_context * ctx = llama_new_context_with_model(model, cparams);
- if (ctx == NULL) {
- fprintf(stderr, "%s: error: unable to create context\n", __func__);
+ // init
+ llama_model * model;
+ llama_context * ctx;
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ if (model == nullptr || ctx == nullptr) {
+ fprintf(stderr, "%s : failed to init\n", __func__);
return 1;
}
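As a rough sketch of the `cb_eval` hook wired up above (signature inferred from the `collect_imatrix` hunk; the callback name and the op filter are illustrative, not part of this patch): the scheduler invokes the callback twice per node, first with `ask = true` so the observer can request the tensor, then with `ask = false` once its data is available.

```cpp
// Illustrative ggml eval-callback skeleton; assumes the two-phase ask/observe
// contract that collect_imatrix relies on in this diff.
#include "ggml.h"

static bool my_eval_observer(struct ggml_tensor * t, bool ask, void * user_data) {
    if (ask) {
        // phase 1: return true only for the ops we want to inspect (e.g. mat-muls)
        return t->op == GGML_OP_MUL_MAT || t->op == GGML_OP_MUL_MAT_ID;
    }
    // phase 2: the data of t has been computed; inspect/accumulate it here
    (void) user_data;
    return true; // keep the graph running
}

// wired up through gpt_params, as the imatrix change above does:
//   params.cb_eval           = my_eval_observer;
//   params.cb_eval_user_data = nullptr;
//   params.warmup            = false;
```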
diff --git a/examples/infill/README.md b/examples/infill/README.md
index 8c97f719b..6b076c839 100644
--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@@ -36,6 +36,11 @@ The `infill` program offers a seamless way to interact with LLaMA models, allowi
### Example
+Download a model that supports infill, for example CodeLlama:
+```console
+scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
+```
+
```bash
./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
```
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index c69dcd06e..afac145f6 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -586,7 +586,7 @@ int main(int argc, char ** argv) {
// deal with eot token in infill mode
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
- if(is_interacting && !params.interactive_first) {
+ if (is_interacting && !params.interactive_first) {
// print an eot token
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
@@ -651,8 +651,8 @@ int main(int argc, char ** argv) {
// LOG_TEE("took new input\n");
is_interacting = false;
}
- // deal with end of text token in interactive mode
- else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
+ // deal with end of generation tokens in interactive mode
+ else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
LOG("found EOS token\n");
if (params.interactive) {
@@ -731,8 +731,8 @@ int main(int argc, char ** argv) {
}
}
- // end of text token
- if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
+ // end of generation
+ if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
break;
}
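The eos-to-eog switch above generalizes the stop condition from a single end-of-sequence token to any end-of-generation token (EOS, EOT, ...). A minimal sketch of how a decode loop might use it; only `llama_token_is_eog()` is the API this diff actually adopts, the rest is placeholder glue.

```cpp
// Hypothetical generation-loop fragment using the new eog check.
#include "llama.h"

static void generate_until_eog(llama_model * model, llama_context * ctx, int n_max) {
    (void) ctx;
    for (int i = 0; i < n_max; ++i) {
        llama_token tok = /* sample the next token with your sampler of choice */ 0;
        if (llama_token_is_eog(model, tok)) {
            break; // stops on EOS, EOT, or any other end-of-generation token
        }
        // ... append tok to the batch, call llama_decode(), print the piece, etc.
    }
}
```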
diff --git a/examples/json-schema-to-grammar.py b/examples/json_schema_to_grammar.py

similarity index 73%
rename from examples/json-schema-to-grammar.py
rename to examples/json_schema_to_grammar.py
index 91dd734cc..826cd3f72 100755
--- a/examples/json-schema-to-grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -6,37 +6,94 @@ import re
import sys
from typing import Any, Dict, List, Set, Tuple, Union
+def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
+ if not separator_rule:
+ if min_items == 0 and max_items == 1:
+ return f'{item_rule}?'
+ elif min_items == 1 and max_items is None:
+ return f'{item_rule}+'
+
+ result = ''
+
+ if min_items > 0:
+ if item_rule_is_literal and separator_rule is None:
+ result = '"' + (item_rule[1:-1] * min_items) + '"'
+ else:
+ result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
+
+ def opt_repetitions(up_to_n, prefix_with_sep=False):
+ '''
+ - n=4, no sep: '(a (a (a (a)?)?)?)?'
+ - n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?'
+ - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
+ '''
+
+ content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
+ if up_to_n == 0:
+ return ''
+ elif up_to_n == 1:
+ return f'({content})?'
+ elif separator_rule and not prefix_with_sep:
+ return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
+ else:
+ return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
+
+ if min_items > 0 and max_items != min_items:
+ result += ' '
+
+ if max_items is not None:
+ result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
+ else:
+ item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
+
+ if min_items == 0 and separator_rule:
+ result = f'({item_rule} {item_operator}*)?'
+ else:
+ result += f'{item_operator}*'
+
+ return result
+
+
+class BuiltinRule:
+ def __init__(self, content: str, deps: list = None):
+ self.content = content
+ self.deps = deps or []
+
+_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
+
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?'
PRIMITIVE_RULES = {
- 'boolean': '("true" | "false") space',
- 'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
- 'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
- 'value' : 'object | array | string | number | boolean',
- 'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
- 'array' : '"[" space ( value ("," space value)* )? "]" space',
- 'uuid' : '"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space',
- 'string': r''' "\"" (
- [^"\\] |
- "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
- )* "\"" space''',
- 'null': '"null" space',
+ 'boolean' : BuiltinRule('("true" | "false") space', []),
+ 'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
+ 'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
+ 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
+ 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
+ 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
+ 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
+ 'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
+ 'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
+ 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
+ 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
+ 'null' : BuiltinRule('"null" space', []),
}
-OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value']
# TODO: support "uri", "email" string formats
-DATE_RULES = {
- 'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
- 'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
- 'date-time': 'date "T" time',
- 'date-string': '"\\"" date "\\"" space',
- 'time-string': '"\\"" time "\\"" space',
- 'date-time-string': '"\\"" date-time "\\"" space',
+STRING_FORMAT_RULES = {
+ 'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
+ 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
+ 'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
+ 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
+ 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
+ 'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
}
-RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *DATE_RULES.keys()])
+DOTALL = '[\\U00000000-\\U0010FFFF]'
+DOT = '[^\\x0A\\x0D]'
+
+RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
@@ -46,8 +103,6 @@ GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']'
NON_LITERAL_SET = set('|.()[]{}*+?')
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
-DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
-TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits
class SchemaConverter:
def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
@@ -55,7 +110,9 @@ class SchemaConverter:
self._allow_fetch = allow_fetch
self._dotall = dotall
self._raw_pattern = raw_pattern
- self._rules = {'space': SPACE_RULE}
+ self._rules = {
+ 'space': SPACE_RULE,
+ }
self._refs = {}
self._refs_being_resolved = set()
@@ -65,6 +122,29 @@ class SchemaConverter:
)
return f'"{escaped}"'
+ def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str:
+ '''
+ not_literal('a') -> '[^a]'
+ not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?'
+ '''
+ assert len(literal) > 0, 'Empty literal not supported'
+ def recurse(i: int):
+ c = literal[i]
+ if maybe_escaped_underscores and c == '_':
+ yield f'[^{c}\\\\]'
+ yield ' | '
+ yield f'"\\\\"? "{c}"'
+ else:
+ yield f'[^{c}]'
+ if i < len(literal) - 1:
+ yield ' | '
+ yield self._format_literal(c)
+ yield ' ('
+ yield from recurse(i + 1)
+ yield ')?'
+
+ return ''.join(('(', *recurse(0), ')'))
+
def _add_rule(self, name, rule):
esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
if esc_name not in self._rules or self._rules[esc_name] == rule:
@@ -169,10 +249,10 @@ class SchemaConverter:
def get_dot():
if self._dotall:
- rule = '[\\U00000000-\\U0010FFFF]'
+ rule = DOTALL
else:
# Accept any character... except \n and \r line break chars (\x0A and \x0D)
- rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]'
+ rule = DOT
return self._add_rule(f'dot', rule)
def join_seq():
@@ -246,26 +326,14 @@ class SchemaConverter:
(sub, sub_is_literal) = seq[-1]
- if min_times == 0 and max_times is None:
- seq[-1] = (f'{sub}*', False)
- elif min_times == 0 and max_times == 1:
- seq[-1] = (f'{sub}?', False)
- elif min_times == 1 and max_times is None:
- seq[-1] = (f'{sub}+', False)
- else:
- if not sub_is_literal:
- id = sub_rule_ids.get(sub)
- if id is None:
- id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
- sub_rule_ids[sub] = id
- sub = id
+ if not sub_is_literal:
+ id = sub_rule_ids.get(sub)
+ if id is None:
+ id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
+ sub_rule_ids[sub] = id
+ sub = id
- seq[-1] = (
- ' '.join(
- ([f'"{sub[1:-1] * min_times}"'] if sub_is_literal else [sub] * min_times) +
- ([f'{sub}?'] * (max_times - min_times) if max_times is not None else [f'{sub}*'])),
- False
- )
+ seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
else:
literal = ''
while i < length:
@@ -373,49 +441,47 @@ class SchemaConverter:
' "]" space')
else:
item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
- list_item_operator = f'( "," space {item_rule_name} )'
- successive_items = ""
min_items = schema.get("minItems", 0)
max_items = schema.get("maxItems")
- if min_items > 0:
- successive_items = list_item_operator * (min_items - 1)
- min_items -= 1
- if max_items is not None and max_items > min_items:
- successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
- else:
- successive_items += list_item_operator + "*"
- if min_items == 0:
- rule = f'"[" space ( {item_rule_name} {successive_items} )? "]" space'
- else:
- rule = f'"[" space {item_rule_name} {successive_items} "]" space'
- return self._add_rule(rule_name, rule)
+ return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')
elif schema_type in (None, 'string') and 'pattern' in schema:
return self._visit_pattern(schema['pattern'], rule_name)
elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
- return self._add_rule(
+ return self._add_primitive(
'root' if rule_name == 'root' else schema_format,
PRIMITIVE_RULES['uuid']
)
- elif schema_type in (None, 'string') and schema_format in DATE_RULES:
- for t, r in DATE_RULES.items():
- self._add_rule(t, r)
- return schema_format + '-string'
+ elif schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES:
+ prim_name = f'{schema_format}-string'
+ return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]))
+
+ elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema):
+ char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
+ min_len = schema.get('minLength', 0)
+ max_len = schema.get('maxLength')
+
+ return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
elif (schema_type == 'object') or (len(schema) == 0):
- for n in OBJECT_RULE_NAMES:
- self._add_rule(n, PRIMITIVE_RULES[n])
- return self._add_rule(rule_name, 'object')
+ return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
else:
assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
# TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
- return self._add_rule(
- 'root' if rule_name == 'root' else schema_type,
- PRIMITIVE_RULES[schema_type]
- )
+ return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type])
+
+ def _add_primitive(self, name: str, rule: BuiltinRule):
+ n = self._add_rule(name, rule.content)
+
+ for dep in rule.deps:
+ dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep)
+ assert dep_rule, f'Rule {dep} not known'
+ if dep not in self._rules:
+ self._add_primitive(dep, dep_rule)
+ return n
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
prop_order = self._prop_order
@@ -437,7 +503,7 @@ class SchemaConverter:
value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
prop_kv_rule_names["*"] = self._add_rule(
f'{sub_name}-kv',
- self._add_rule('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
+ self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
)
optional_props.append("*")
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 27e113203..8b532c8b6 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -190,7 +190,7 @@ static const cmd_params cmd_params_defaults = {
/* n_ubatch */ {512},
/* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16},
- /* n_threads */ {get_num_physical_cores()},
+ /* n_threads */ {get_math_cpu_count()},
/* n_gpu_layers */ {99},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0},
diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/app/src/main/cpp/llama-android.cpp
index ce8ab3b70..4af9de303 100644
--- a/examples/llama.android/app/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp
@@ -408,7 +408,7 @@ Java_com_example_llama_Llm_completion_1loop(
const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
- if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+ if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
return env->NewStringUTF("");
}
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index c249291ae..737f882fb 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -158,7 +158,7 @@ actor LlamaContext {
new_token_id = llama_sample_token_greedy(context, &candidates_p)
}
- if new_token_id == llama_token_eos(model) || n_cur == n_len {
+ if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
print("\n")
let new_token_str = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()
@@ -322,7 +322,7 @@ actor LlamaContext {
defer {
result.deallocate()
}
- let nTokens = llama_token_to_piece(model, token, result, 8)
+ let nTokens = llama_token_to_piece(model, token, result, 8, false)
if nTokens < 0 {
let newResult = UnsafeMutablePointer.allocate(capacity: Int(-nTokens))
@@ -330,7 +330,7 @@ actor LlamaContext {
defer {
newResult.deallocate()
}
- let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
+ let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
return Array(bufferPointer)
} else {
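For non-Swift callers, the same token-to-piece pattern with the new trailing `special` flag might look like the following C++ sketch; the helper name and buffer size are arbitrary, and the retry-on-negative-return convention simply mirrors the Swift code above (a negative result reports the required buffer size).

```cpp
// Hypothetical C++ counterpart of the Swift token_to_piece above.
#include "llama.h"
#include <algorithm>
#include <string>
#include <vector>

static std::string token_to_piece(const llama_model * model, llama_token token, bool special = false) {
    std::vector<char> buf(8);
    int n = llama_token_to_piece(model, token, buf.data(), (int) buf.size(), special);
    if (n < 0) {
        buf.resize(-n); // grow to the size the API asked for
        n = llama_token_to_piece(model, token, buf.data(), (int) buf.size(), special);
    }
    return std::string(buf.data(), (size_t) std::max(n, 0));
}
```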
diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md
index 96b048525..413e433dd 100644
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -22,7 +22,7 @@ After building, run: `./llava-cli` to see the usage. For example:
## Model conversion
-- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
+1. Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
```sh
git clone https://huggingface.co/mtgv/MobileVLM-1.7B
diff --git a/examples/llava/README.md b/examples/llava/README.md
index 67cb0f22b..d4810d42e 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -24,7 +24,7 @@ After building, run: `./llava-cli` to see the usage. For example:
## LLaVA 1.5
-- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
+1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 5954bf6cd..e3c9bcd43 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -3,6 +3,7 @@
// I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
+#include "log.h"
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
@@ -23,7 +24,6 @@
#include
#include
#include
-#include
#include