Merge pull request #15 from OpenBMB/master

sync master
This commit is contained in:
tc-mb 2024-06-24 11:29:30 +08:00 committed by GitHub
commit cb8cfb9d4d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
440 changed files with 70366 additions and 64817 deletions

View File

@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
stage('Running llama.cpp'){
sh'''#!/bin/bash
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
cat llama_log.txt # Printing results
'''
}

View File

@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements

View File

@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements

View File

@ -23,10 +23,13 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV LLAMA_CUDA=1
RUN make -j$(nproc)
RUN make -j$(nproc) llama-cli
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
COPY --from=build /app/main /main
RUN apt-get update && \
apt-get install -y libgomp1
ENTRYPOINT [ "/main" ]
COPY --from=build /app/llama-cli /llama-cli
ENTRYPOINT [ "/llama-cli" ]

View File

@ -0,0 +1,26 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-cli
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-cli" ]

View File

@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make -j$(nproc)
RUN make -j$(nproc) llama-cli
ENTRYPOINT [ "/app/main" ]
ENTRYPOINT [ "/app/llama-cli" ]

View File

@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION as build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
RUN apt update && apt install -y git build-essential cmake wget libgomp1
# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
@ -15,13 +15,13 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
WORKDIR /app
COPY . .
RUN cmake -B build -DLLAMA_VULKAN=1 && \
cmake --build build --config Release --target main
cmake --build build --config Release --target llama-cli
# Clean up
WORKDIR /
RUN cp /app/build/bin/main /main && \
RUN cp /app/build/bin/llama-cli /llama-cli && \
rm -rf /app
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ]
ENTRYPOINT [ "/llama-cli" ]

View File

@ -9,12 +9,15 @@ WORKDIR /app
COPY . .
RUN make -j$(nproc)
RUN make -j$(nproc) llama-cli
FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/main /main
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/llama-cli /llama-cli
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ]
ENTRYPOINT [ "/llama-cli" ]

View File

@ -36,9 +36,9 @@ make -j LLAMA_CLBLAST=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamaclblast
cp -p server %{buildroot}%{_bindir}/llamaclblastserver
cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli
cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never
@ -67,9 +67,9 @@ rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llamaclblast
%{_bindir}/llamaclblastserver
%{_bindir}/llamaclblastsimple
%{_bindir}/llama-clblast-cli
%{_bindir}/llama-clblast-server
%{_bindir}/llama-clblast-simple
/usr/lib/systemd/system/llamaclblast.service
%config /etc/sysconfig/llama

View File

@ -36,9 +36,9 @@ make -j LLAMA_CUDA=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacppcuda
cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never
@ -67,9 +67,9 @@ rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llamacppcuda
%{_bindir}/llamacppcudaserver
%{_bindir}/llamacppcudasimple
%{_bindir}/llama-cuda-cli
%{_bindir}/llama-cuda-server
%{_bindir}/llama-cuda-simple
/usr/lib/systemd/system/llamacuda.service
%config /etc/sysconfig/llama

View File

@ -38,9 +38,9 @@ make -j
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llama
cp -p server %{buildroot}%{_bindir}/llamaserver
cp -p simple %{buildroot}%{_bindir}/llamasimple
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
cp -p llama-server %{buildroot}%{_bindir}/llama-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
ExecStart=/usr/bin/llama-server $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never
@ -69,9 +69,9 @@ rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llama
%{_bindir}/llamaserver
%{_bindir}/llamasimple
%{_bindir}/llama-cli
%{_bindir}/llama-server
%{_bindir}/llama-simple
/usr/lib/systemd/system/llama.service
%config /etc/sysconfig/llama

View File

@ -25,13 +25,13 @@ ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
RUN make -j$(nproc)
RUN make -j$(nproc) llama-server
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
apt-get install -y libcurl4-openssl-dev libgomp1
COPY --from=build /app/server /server
COPY --from=build /app/llama-server /llama-server
ENTRYPOINT [ "/server" ]
ENTRYPOINT [ "/llama-server" ]

View File

@ -0,0 +1,29 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-server
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
COPY --from=build /app/build/bin/llama-server /llama-server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-server" ]

View File

@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
RUN make -j$(nproc)
RUN make -j$(nproc) llama-server
ENTRYPOINT [ "/app/server" ]
ENTRYPOINT [ "/app/llama-server" ]

View File

@ -19,13 +19,13 @@ RUN apt-get update && \
WORKDIR /app
COPY . .
RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
cmake --build build --config Release --target server
cmake --build build --config Release --target llama-server
# Clean up
WORKDIR /
RUN cp /app/build/bin/server /server && \
RUN cp /app/build/bin/llama-server /llama-server && \
rm -rf /app
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]
ENTRYPOINT [ "/llama-server" ]

View File

@ -11,15 +11,15 @@ COPY . .
ENV LLAMA_CURL=1
RUN make -j$(nproc)
RUN make -j$(nproc) llama-server
FROM ubuntu:$UBUNTU_VERSION as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
apt-get install -y libcurl4-openssl-dev libgomp1
COPY --from=build /app/server /server
COPY --from=build /app/llama-server /llama-server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]
ENTRYPOINT [ "/llama-server" ]

View File

@ -1,34 +0,0 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build build --config Release --target main
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
COPY --from=build /app/build/bin/main /main
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ]

View File

@ -6,11 +6,11 @@
let
inherit (config.packages) default;
binaries = [
"llama"
"llama-cli"
"llama-embedding"
"llama-server"
"quantize"
"train-text-from-scratch"
"llama-quantize"
"llama-train-text-from-scratch"
];
mkApp = name: {
type = "app";

View File

@ -243,8 +243,6 @@ effectiveStdenv.mkDerivation (
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
# if they haven't been added yet.
postInstall = ''
mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
mkdir -p $out/include
cp $src/llama.h $out/include/
'';
@ -294,7 +292,7 @@ effectiveStdenv.mkDerivation (
license = lib.licenses.mit;
# Accommodates `nix run` and `lib.getExe`
mainProgram = "llama";
mainProgram = "llama-cli";
# These people might respond, on the best effort basis, if you ping them
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.

View File

@ -1,45 +0,0 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target server
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
COPY --from=build /app/build/bin/server /server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]

View File

@ -10,11 +10,11 @@ shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert-hf-to-gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./quantize "$@"
./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./main "$@"
./llama-cli "$@"
elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
./finetune "$@"
./llama-finetune "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do
@ -22,11 +22,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
./quantize "$i" "${i/f16/q4_0}" q4_0
./llama-quantize "$i" "${i/f16/q4_0}" q4_0
fi
done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
./server "$@"
./llama-server "$@"
else
echo "Unknown command: $arg1"
echo "Available commands: "

View File

@ -12,8 +12,8 @@ build*/
models/*
/main
/quantize
/llama-cli
/llama-quantize
arm_neon.h
compile_commands.json

View File

@ -26,3 +26,7 @@ indent_size = 2
[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
indent_style = tab
[examples/cvector-generator/*.txt]
trim_trailing_whitespace = unset
insert_final_newline = unset

View File

@ -24,7 +24,7 @@ body:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
$./llama-cli --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:

View File

@ -24,7 +24,7 @@ body:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
$./llama-cli --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:

View File

@ -24,7 +24,7 @@ body:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
$./llama-cli --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:

View File

@ -24,7 +24,7 @@ body:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
$./llama-cli --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:

1
.github/labeler.yml vendored
View File

@ -42,7 +42,6 @@ build:
- cmake/**
- CMakeLists.txt
- CMakePresets.json
- codecov.yml
examples:
- changed-files:
- any-glob-to-any-file: examples/**

7
.github/pull_request_template.md vendored Normal file
View File

@ -0,0 +1,7 @@
- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
- Self-reported review complexity:
- [ ] Low
- [ ] Medium
- [ ] High

View File

@ -119,7 +119,7 @@ jobs:
-DLLAMA_FATAL_WARNINGS=OFF \
-DLLAMA_ALL_WARNINGS=OFF \
-DCMAKE_BUILD_TYPE=Release;
cmake --build build --config Release -j $(nproc) --target server
cmake --build build --config Release -j $(nproc) --target llama-server
- name: Download the dataset
id: download_dataset

View File

@ -13,7 +13,7 @@ on:
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@ -84,7 +84,7 @@ jobs:
name: llama-bin-macos-arm64.zip
macOS-latest-cmake-x64:
runs-on: macos-latest
runs-on: macos-12
steps:
- name: Clone
@ -103,12 +103,10 @@ jobs:
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
id: cmake_test
@ -241,8 +239,8 @@ jobs:
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
- name: Determine tag name
id: tag
@ -684,12 +682,10 @@ jobs:
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake:
runs-on: windows-latest
runs-on: windows-2019
env:
OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17
CLBLAST_VERSION: 1.6.0
SDE_VERSION: 9.33.0-2024-01-07
VULKAN_VERSION: 1.3.261.1
@ -706,8 +702,6 @@ jobs:
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'clblast-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute-x64'
@ -732,27 +726,6 @@ jobs:
run: |
git submodule update --init kompute
- name: Download OpenCL SDK
id: get_opencl
if: ${{ matrix.build == 'clblast-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
mkdir $env:RUNNER_TEMP/opencl
tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
- name: Download CLBlast
id: get_clblast
if: ${{ matrix.build == 'clblast-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
$txt = Get-Content -Path $f -Raw
$txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
}
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas-x64' }}
@ -786,13 +759,6 @@ jobs:
cmake -S . -B build ${{ matrix.defines }}
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add clblast.dll
id: add_clblast_dll
if: ${{ matrix.build == 'clblast-x64' }}
run: |
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
- name: Add libopenblas.dll
id: add_libopenblas_dll
if: ${{ matrix.build == 'openblas-x64' }}
@ -816,7 +782,7 @@ jobs:
- name: Test
id: cmake_test
# not all machines have native AVX-512
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
@ -861,7 +827,7 @@ jobs:
name: llama-bin-win-${{ matrix.build }}.zip
windows-latest-cmake-cuda:
runs-on: windows-latest
runs-on: windows-2019
strategy:
matrix:
@ -875,8 +841,9 @@ jobs:
with:
fetch-depth: 0
- uses: Jimver/cuda-toolkit@v0.2.11
- name: Install CUDA toolkit
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.15
with:
cuda: ${{ matrix.cuda }}
method: 'network'
@ -1071,7 +1038,7 @@ jobs:
# hypervisor: 'qemu'
# run: |
# sudo pkg update
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
release:

View File

@ -1,40 +0,0 @@
name: Code Coverage
on: [push, pull_request]
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
run:
runs-on: ubuntu-20.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8 lcov
- name: Build
run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
- name: Run tests
run: CC=gcc-8 make test
- name: Generate coverage report
run: |
make coverage
make lcov-report
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
with:
files: lcov-report/coverage.info

View File

@ -30,20 +30,20 @@ jobs:
strategy:
matrix:
config:
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
# have disabled them for now until the reason why
# is understood.
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
steps:
- name: Check out the repo
uses: actions/checkout@v4

View File

@ -16,11 +16,9 @@ on:
branches:
- master
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
pull_request_target:
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
schedule:
- cron: '2 4 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@ -32,7 +30,7 @@ jobs:
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
build_type: [RelWithDebInfo]
include:
- build_type: Release
@ -89,8 +87,22 @@ jobs:
exit 1
fi
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DLLAMA_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build
id: cmake_build
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_NATIVE=OFF \
@ -98,7 +110,7 @@ jobs:
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
@ -115,7 +127,7 @@ jobs:
server-windows:
runs-on: windows-latest
runs-on: windows-2019
steps:
- name: Clone
@ -138,7 +150,7 @@ jobs:
id: cmake_build
run: |
cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
- name: Python setup
id: setup_python

155
.gitignore vendored
View File

@ -1,129 +1,122 @@
*.o
# Extensions
*.a
*.so
*.bat
*.bin
*.dll
*.dot
*.etag
*.exe
*.gcda
*.gcno
*.gcov
*.gguf
*.gguf.json
*.bin
*.exe
*.dll
*.log
*.gcov
*.gcno
*.gcda
*.dot
*.bat
*.tmp
*.metallib
*.etag
*.lastModified
.DS_Store
.build/
*.log
*.metallib
*.o
*.so
*.tmp
# IDE / OS
.cache/
.ccls-cache/
.direnv/
.DS_Store
.envrc
.idea/
.swiftpm
.venv
.clang-tidy
.vs/
.vscode/
.idea/
nppBackup
ggml-metal-embed.metal
lcov-report/
# Coverage
gcovr-report/
lcov-report/
# Build Artifacts
tags
.build/
build*
!build-info.cmake
!build-info.cpp.in
!build-info.sh
!build.zig
/libllama.so
/llama-*
android-ndk-*
arm_neon.h
cmake-build-*
CMakeSettings.json
compile_commands.json
ggml-metal-embed.metal
llama-batched-swift
out/
tmp/
# CI
!.github/workflows/*.yml
# Models
models/*
models-mnt
!models/.editorconfig
!models/ggml-vocab-*.gguf*
/Pipfile
/baby-llama
/beam-search
/benchmark-matmult
/convert-llama2c-to-ggml
/embd-input-test
/embedding
/eval-callback
/gguf
/gguf-llama-simple
/gguf-split
/gritlm
/imatrix
/infill
/libllama.so
/llama-bench
/llava-cli
/minicpmv-cli
/openbmb
/lookahead
/lookup
/lookup-create
/lookup-merge
/lookup-stats
/main
/metal
/passkey
/perplexity
/q8dot
/quantize
/quantize-stats
/result
/save-load-state
/server
/simple
/batched
/batched-bench
/export-lora
/finetune
/retrieval
/speculative
/parallel
/train-text-from-scratch
/tokenize
/vdot
/common/build-info.cpp
arm_neon.h
compile_commands.json
CMakeSettings.json
__pycache__
dist
# Zig
zig-out/
zig-cache/
# Logs
ppl-*.txt
qnt-*.txt
perf-*.txt
# Examples
examples/jeopardy/results.txt
examples/server/*.css.hpp
examples/server/*.html.hpp
examples/server/*.js.hpp
examples/server/*.mjs.hpp
examples/server/*.css.hpp
!build_64.sh
!examples/*.bat
!examples/*/*.kts
!examples/*/*/*.kts
!examples/sycl/*.bat
!examples/sycl/*.sh
# Python
__pycache__
.venv
/Pipfile
dist
poetry.lock
poetry.toml
nppBackup
# Test binaries
/tests/test-grammar-parser
/tests/test-llama-grammar
/tests/test-backend-ops
/tests/test-double-float
/tests/test-grad0
/tests/test-grammar-parser
/tests/test-llama-grammar
/tests/test-opt
/tests/test-quantize-fns
/tests/test-quantize-perf
/tests/test-rope
/tests/test-sampling
/tests/test-tokenizer-0
/tests/test-tokenizer-1-spm
/tests/test-tokenizer-1-bpe
/tests/test-rope
/tests/test-backend-ops
/tests/test-tokenizer-1-spm
# Scripts
!/scripts/install-oneapi.bat

View File

@ -39,8 +39,12 @@ endif()
if (APPLE)
set(LLAMA_METAL_DEFAULT ON)
set(LLAMA_BLAS_DEFAULT ON)
set(LLAMA_BLAS_VENDOR_DEFAULT "Apple")
else()
set(LLAMA_METAL_DEFAULT OFF)
set(LLAMA_BLAS_DEFAULT OFF)
set(LLAMA_BLAS_VENDOR_DEFAULT "Generic")
endif()
set(LLAMA_LLAMAFILE_DEFAULT ON)
@ -91,9 +95,10 @@ endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
option(LLAMA_BLAS "llama: use BLAS" ${LLAMA_BLAS_DEFAULT})
set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING
"llama: BLAS library vendor")
option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
@ -111,10 +116,10 @@ option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for Flas
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_VULKAN "llama: use Vulkan" OFF)
option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF)
option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF)
option(LLAMA_VULKAN_MEMORY_DEBUG "llama: enable Vulkan memory debug output" OFF)
option(LLAMA_VULKAN_VALIDATE "llama: enable Vulkan validation" OFF)
option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" OFF)
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
@ -139,9 +144,6 @@ option(LLAMA_BUILD_SERVER "llama: build server example"
option(LLAMA_LASX "llama: enable lasx" ON)
option(LLAMA_LSX "llama: enable lsx" ON)
# add perf arguments
option(LLAMA_PERF "llama: enable perf" OFF)
# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
@ -312,9 +314,9 @@ if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
endif()
if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
set(BLA_SIZEOF_INTEGER 8)
endif()
#if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
# set(BLA_SIZEOF_INTEGER 8)
#endif()
set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
find_package(BLAS)
@ -322,7 +324,7 @@ if (LLAMA_BLAS)
if (BLAS_FOUND)
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${LLAMA_BLAS_VENDOR} MATCHES "Apple"))
# BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
# see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
find_package(PkgConfig REQUIRED)
@ -375,12 +377,15 @@ if (LLAMA_BLAS)
add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_OPENBLAS)
add_compile_definitions(GGML_USE_BLAS)
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
add_compile_definitions(GGML_BLAS_USE_MKL)
endif()
set(GGML_HEADERS_BLAS ggml-blas.h)
set(GGML_SOURCES_BLAS ggml-blas.cpp)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
else()
@ -403,12 +408,26 @@ if (LLAMA_CUBLAS)
endif()
if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.17)
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "CUDA found")
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
enable_language(CUDA)
set(GGML_HEADERS_CUDA ggml-cuda.h)
@ -417,6 +436,8 @@ if (LLAMA_CUDA)
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_USE_CUDA)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
@ -471,21 +492,6 @@ if (LLAMA_CUDA)
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
endif()
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else()
message(WARNING "CUDA not found")
endif()
@ -502,22 +508,6 @@ if (LLAMA_RPC)
set(GGML_SOURCES_RPC ggml-rpc.cpp)
endif()
if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")
set(GGML_HEADERS_OPENCL ggml-opencl.h)
set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
add_compile_definitions(GGML_USE_CLBLAST)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
else()
message(WARNING "CLBlast not found")
endif()
endif()
if (LLAMA_VULKAN)
find_package(Vulkan)
if (Vulkan_FOUND)
@ -542,6 +532,10 @@ if (LLAMA_VULKAN)
add_compile_definitions(GGML_VULKAN_DEBUG)
endif()
if (LLAMA_VULKAN_MEMORY_DEBUG)
add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
endif()
if (LLAMA_VULKAN_VALIDATE)
add_compile_definitions(GGML_VULKAN_VALIDATE)
endif()
@ -557,12 +551,17 @@ if (LLAMA_VULKAN)
endif()
if (LLAMA_HIPBLAS)
if ($ENV{ROCM_PATH})
set(ROCM_PATH $ENV{ROCM_PATH})
if (NOT EXISTS $ENV{ROCM_PATH})
if (NOT EXISTS /opt/rocm)
set(ROCM_PATH /usr)
else()
set(ROCM_PATH /opt/rocm)
endif()
else()
set(ROCM_PATH /opt/rocm)
set(ROCM_PATH $ENV{ROCM_PATH})
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
# CMake on Windows doesn't support the HIP language yet
if(WIN32)
@ -600,6 +599,8 @@ if (LLAMA_HIPBLAS)
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
@ -661,6 +662,7 @@ if (LLAMA_SYCL)
#todo: AOT
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
message(STATUS "SYCL found")
@ -675,21 +677,22 @@ if (LLAMA_SYCL)
endif()
add_compile_options(-I./) #include DPCT
add_compile_options(-I/${SYCL_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
endif()
set(GGML_HEADERS_SYCL ggml-sycl.h)
set(GGML_SOURCES_SYCL ggml-sycl.cpp)
file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
if (WIN32)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
else()
add_compile_options(-I/${SYCL_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
@ -864,10 +867,6 @@ if (LLAMA_CPU_HBM)
target_link_libraries(ggml PUBLIC memkind)
endif()
if (LLAMA_PERF)
add_compile_definitions(GGML_PERF)
endif()
function(get_flags CCID CCVER)
set(C_FLAGS "")
set(CXX_FLAGS "")
@ -1260,7 +1259,6 @@ add_library(ggml OBJECT
ggml-quants.c
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
@ -1268,6 +1266,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
)
@ -1348,8 +1347,9 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
"${GGML_HEADERS_CUDA}"
"${GGML_HEADERS_METAL}"
"${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml PUBLIC_HEADER)

View File

@ -11,9 +11,21 @@
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
},
{
"name": "sycl-base",
"hidden": true,
"generator": "Ninja",
"binaryDir": "${sourceDir}/build-${presetName}",
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_CXX_COMPILER": "icx",
"LLAMA_SYCL": "ON",
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
},
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
{
@ -35,15 +47,18 @@
},
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
]
}

14
CONTRIBUTING.md Normal file
View File

@ -0,0 +1,14 @@
# Contributing Guidelines
## Checklist
* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
* Execute [the full CI locally on your machine](ci/README.md) before publishing
## PR formatting
* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
- The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your conveience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`

200
Makefile
View File

@ -1,8 +1,46 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli minicpmv-cli baby-llama beam-search \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
libllava.a \
llama-baby-llama \
llama-batched \
llama-batched-bench \
llama-bench \
llama-benchmark-matmult \
llama-cli \
llama-convert-llama2c-to-ggml \
llama-embedding \
llama-eval-callback \
llama-export-lora \
llama-finetune \
llama-gbnf-validator \
llama-gguf \
llama-gguf-split \
llama-gritlm \
llama-imatrix \
llama-infill \
llama-llava-cli \
llama-minicpmv-cli\
llama-lookahead \
llama-lookup \
llama-lookup-create \
llama-lookup-merge \
llama-lookup-stats \
llama-parallel \
llama-passkey \
llama-perplexity \
llama-q8dot \
llama-quantize \
llama-quantize-stats \
llama-retrieval \
llama-save-load-state \
llama-server \
llama-simple \
llama-speculative \
llama-tokenize \
llama-train-text-from-scratch \
llama-vdot \
llama-cvector-generator \
tests/test-c.o
# Binaries only useful for tests
TEST_TARGETS = \
@ -307,9 +345,6 @@ ifdef LLAMA_GPROF
MK_CFLAGS += -pg
MK_CXXFLAGS += -pg
endif
ifdef LLAMA_PERF
MK_CPPFLAGS += -DGGML_PERF
endif
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
@ -404,10 +439,11 @@ ifndef LLAMA_NO_ACCELERATE
# Mac OS - include Accelerate framework.
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
ifeq ($(UNAME_S),Darwin)
MK_CPPFLAGS += -DGGML_USE_ACCELERATE
MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
MK_LDFLAGS += -framework Accelerate
OBJS += ggml-blas.o
endif
endif # LLAMA_NO_ACCELERATE
@ -418,21 +454,30 @@ ifndef LLAMA_NO_OPENMP
endif # LLAMA_NO_OPENMP
ifdef LLAMA_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
MK_LDFLAGS += $(shell pkg-config --libs openblas)
OBJS += ggml-blas.o
endif # LLAMA_OPENBLAS
ifdef LLAMA_OPENBLAS64
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64)
MK_LDFLAGS += $(shell pkg-config --libs openblas64)
OBJS += ggml-blas.o
endif # LLAMA_OPENBLAS64
ifdef LLAMA_BLIS
MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
MK_LDFLAGS += -lblis -L/usr/local/lib
OBJS += ggml-blas.o
endif # LLAMA_BLIS
ifndef LLAMA_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJS += sgemm.o
endif
ifdef LLAMA_BLIS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
MK_LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS
ifdef LLAMA_RPC
MK_CPPFLAGS += -DGGML_USE_RPC
OBJS += ggml-rpc.o
@ -444,6 +489,7 @@ ifdef LLAMA_CUBLAS
endif
OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/mmq*.cu))
ifdef LLAMA_CUDA_FA_ALL_QUANTS
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
else
@ -459,7 +505,7 @@ ifdef LLAMA_CUDA
CUDA_PATH ?= /usr/local/cuda
endif
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
OBJS += $(OBJS_CUDA_TEMP_INST)
@ -547,23 +593,6 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h
$(NVCC_COMPILE)
endif # LLAMA_CUDA
ifdef LLAMA_CLBLAST
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
# Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin)
MK_LDFLAGS += -lclblast -framework OpenCL
else
MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
endif
OBJS += ggml-opencl.o
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_CLBLAST
ifdef LLAMA_VULKAN
MK_CPPFLAGS += -DGGML_USE_VULKAN
MK_LDFLAGS += -lvulkan
@ -577,6 +606,10 @@ ifdef LLAMA_VULKAN_DEBUG
MK_CPPFLAGS += -DGGML_VULKAN_DEBUG
endif
ifdef LLAMA_VULKAN_MEMORY_DEBUG
MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
endif
ifdef LLAMA_VULKAN_VALIDATE
MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
endif
@ -756,6 +789,9 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-blas.o: ggml-blas.cpp ggml-blas.h
$(CXX) $(CXXFLAGS) -c $< -o $@
unicode.o: unicode.cpp unicode.h
$(CXX) $(CXXFLAGS) -c $< -o $@
@ -793,7 +829,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
clean:
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf *.o tests/*.o *.so *.a *.dll common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf ggml-cuda/*.o
rm -vrf ggml-cuda/template-instances/*.o
find examples pocs -type f -name "*.o" -delete
@ -809,62 +845,62 @@ clean:
# Helper function that replaces .c, .cpp, and .cu file endings with .o:
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
llama-cli: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo '==== Run ./llama-cli -h for help. ===='
@echo
infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
llama-infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
llama-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
@ -877,23 +913,27 @@ examples/server/%.hpp: examples/server/public/% Makefile
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
) > $@
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -904,66 +944,68 @@ llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS)
libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
minicpmv-cli: examples/llava/minicpmv-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp examples/llava/minicpmv_wrapper.h examples/llava/minicpmv_wrapper.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp examples/llava/minicpmv_wrapper.h examples/llava/minicpmv_wrapper.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
$(CXX) $(CXXFLAGS) -c examples/llava/minicpmv_wrapper.cpp -o $(call GET_OBJ_FILE, examples/llava/minicpmv_wrapper.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp examples/llava/minicpmv_wrapper.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(call GET_OBJ_FILE, examples/llava/minicpmv_wrapper.cpp) -o $@ $(LDFLAGS)
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
llama-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
llama-retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
llama-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-lookup-create: examples/lookup/lookup-create.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
llama-lookup-merge: examples/lookup/lookup-merge.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-lookup-stats: examples/lookup/lookup-stats.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -989,20 +1031,20 @@ build-info.o: common/build-info.cpp
tests: $(TEST_TARGETS)
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
run-benchmark-matmult: benchmark-matmult
run-benchmark-matmult: llama-benchmark-matmult
./$@
.PHONY: run-benchmark-matmult swift
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
llama-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
llama-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1014,7 +1056,7 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
tests/test-grammar-integration: tests/test-grammar-integration.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

View File

@ -1,6 +1,7 @@
# llama.cpp for SYCL
- [Background](#background)
- [Recommended Release](#recommended-release)
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)
@ -29,10 +30,25 @@ The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based o
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
## Recommended Release
The SYCL backend would be broken by some PRs due to no online CI.
The following release is verified with good quality:
|Commit ID|Tag|Release|Verified Platform|
|-|-|-|-|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
## News
- 2024.5
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
- Arch Linux is verified successfully.
- 2024.4
- Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.
@ -77,7 +93,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
*Notes:*
- **Memory**
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
@ -99,14 +115,14 @@ The docker build option is currently limited to *intel GPU* targets.
### Build image
```sh
# Using FP16
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
```
*Notes*:
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
### Run container
@ -275,7 +291,7 @@ source /opt/intel/oneapi/setvars.sh
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
```sh
./build/bin/ls-sycl-device
./build/bin/llama-ls-sycl-device
```
A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
```
@ -313,7 +329,7 @@ Examples:
- Use device 0:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```
or run by script:
@ -324,7 +340,7 @@ or run by script:
- Use multiple devices:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
Otherwise, you can run the script:
@ -394,15 +410,9 @@ Output (example):
4. Install build tools
a. Download & install cmake for Windows: https://cmake.org/download/
a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
b. Download & install mingw-w64 make for Windows provided by w64devkit
- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
- Extract `w64devkit` on your pc.
- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).
### II. Build llama.cpp
@ -412,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
# Option 2: Or FP16
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
cmake --build build --config Release -j
```
@ -425,9 +435,23 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
.\examples\sycl\win-build-sycl.bat
```
Or, use CMake presets to build:
```sh
cmake --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j --target llama-cli
cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j --target llama-cli
cmake --preset x64-windows-sycl-debug
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
```
Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
*Notes:*
- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
### III. Run the inference
@ -488,13 +512,13 @@ Examples:
- Use device 0:
```
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
```
- Use multiple devices:
```
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```
Otherwise, run the following wrapper script:

224
README.md
View File

@ -10,6 +10,9 @@
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
> [!IMPORTANT]
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
### Recent API changes
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
@ -53,7 +56,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
<li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
<li><a href="#instruct-mode">Instruct mode</a></li>
<li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
<li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
<li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
@ -77,7 +79,7 @@ variety of hardware - locally and in the cloud.
- AVX, AVX2 and AVX512 support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
- Vulkan, SYCL, and (partial) OpenCL backend support
- Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
@ -193,6 +195,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
- [RAGNA Desktop](https://ragna.app/) (proprietary)
- [RecurseChat](https://recurse.chat/) (proprietary)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)
@ -206,6 +209,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
@ -218,7 +222,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
Here is a typical run using LLaMA v2 13B on M2 Ultra:
```
$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
I llama.cpp build info:
I UNAME_S: Darwin
I UNAME_P: arm
@ -364,17 +368,6 @@ In order to build llama.cpp you have four different options.
cmake --build build --config Debug
```
- Using `Zig` (version 0.11 or later):
Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
it's also possible to cross compile for other operating systems and architectures:
```bash
zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
```
The `zig targets` command will give you valid options to use.
- Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@ -382,16 +375,11 @@ In order to build llama.cpp you have four different options.
3. Install compilation dependencies.
```bash
sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
opencl clblast openblas
sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
```
**Notes:** With this packages you can build llama.cpp with OPENBLAS and
CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
the instructions for use and activate this options in this document below.
### Homebrew
On Mac and Linux, the homebrew package manager can be used via
@ -400,6 +388,30 @@ brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
### Nix
On Mac and Linux, the Nix package manager can be used via
```
nix profile install nixpkgs#llama-cpp
```
For flake enabled installs.
Or
```
nix-env --file '<nixpkgs>' --install --attr llama-cpp
```
For non-flake enabled installs.
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
#### Flox
On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
```
flox install llama-cpp
```
Flox follows the nixpkgs build of llama.cpp.
### Metal Build
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
@ -410,7 +422,7 @@ argument.
### BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
- #### Accelerate Framework:
@ -564,111 +576,6 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### CLBlast
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
- For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.
- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
- <details>
<summary>Installing the OpenCL SDK from source</summary>
```sh
git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
cd OpenCL-SDK
cmake -B build -DBUILD_DOCS=OFF \
-DBUILD_EXAMPLES=OFF \
-DBUILD_TESTING=OFF \
-DOPENCL_SDK_BUILD_SAMPLES=OFF \
-DOPENCL_SDK_TEST_SAMPLES=OFF
cmake --build build
cmake --install build --prefix /some/path
```
</details>
##### Installing CLBlast
Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
Linux packaging:
Fedora Linux:
```bash
sudo dnf install clblast
```
Alternatively, they may be built from source.
- <details>
<summary>Windows:</summary>
```cmd
set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
git clone https://github.com/CNugteren/CLBlast.git
cd CLBlast
cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/CLBlast
```
(note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds)
- <details>
<summary>Unix:</summary>
```sh
git clone https://github.com/CNugteren/CLBlast.git
cd CLBlast
cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
cmake --build build --config Release
cmake --install build --prefix /some/path
```
Where `/some/path` is where the built library will be installed (default is `/usr/local`).
</details>
##### Building Llama with CLBlast
- Build with make:
```sh
make LLAMA_CLBLAST=1
```
- CMake (Unix):
```sh
cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
cmake --build build --config Release
```
- CMake (Windows):
```cmd
set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/LlamaCPP
```
##### Running Llama with CLBlast
The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
The selection can be a number (starting from 0) or a text string to search:
```sh
GGML_OPENCL_PLATFORM=1 ./main ...
GGML_OPENCL_DEVICE=2 ./main ...
GGML_OPENCL_PLATFORM=Intel ./main ...
GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
```
The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
Using the variables it is possible to select a CPU-based driver as well, if so desired.
You can get a list of platforms and devices from the `clinfo -l` command, etc.
- #### Vulkan
**With docker**:
@ -677,7 +584,7 @@ Building the program with BLAS support may lead to some performance improvements
```sh
# Build the image
docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
# Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
@ -698,7 +605,9 @@ Building the program with BLAS support may lead to some performance improvements
vulkaninfo
```
Alternatively your package manager might be able to provide the appropiate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
Alternatively your package manager might be able to provide the appropriate libraries.
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
Then, build llama.cpp using the cmake command below:
@ -706,7 +615,7 @@ Building the program with BLAS support may lead to some performance improvements
cmake -B build -DLLAMA_VULKAN=1
cmake --build build --config Release
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
# You should see in the output, ggml_vulkan detected your GPU. For example:
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
@ -719,7 +628,7 @@ Building the program with BLAS support may lead to some performance improvements
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derievatives.
Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
```bash
@ -739,21 +648,18 @@ python3 -m pip install -r requirements.txt
# convert the model to ggml FP16 format
python3 convert-hf-to-gguf.py models/mymodel/
# [Optional] for models using BPE tokenizers
python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
# quantize the model to 4-bits (using Q4_K_M method)
./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
# update the gguf filetype to current version if older version is now unsupported
./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
```
### Run the quantized model
```bash
# start inference on a gguf model
./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
```
When running the larger models, make sure you have enough disk space to store all the intermediate files.
@ -828,7 +734,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread
#### How to run
1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
2. Run `./llama-perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
```
perplexity : calculating perplexity over 655 chunks
@ -852,16 +758,16 @@ Here is an example of a few-shot interaction, invoked with the command
./examples/chat-13B.sh
# custom arguments using a 13B model
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
### Persistent Interaction
The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
```bash
# Start a new chat
@ -883,41 +789,13 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
```bash
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
```
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
### Instruct mode
1. First, download and place the `ggml` model into the `./models` folder
2. Run the `main` tool like this:
```
./examples/alpaca.sh
```
Sample run:
```
== Running in interactive mode. ==
- Press Ctrl+C to interject at any time.
- Press Return to return control to LLaMA.
- If you want to submit another line, end your input in '\'.
Below is an instruction that describes a task. Write a response that appropriately completes the request.
> How many letters are there in the English alphabet?
There 26 letters in the English Alphabet
> What is the most common way of transportation in Amsterdam?
The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
> List 5 words that start with "ca".
cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
```
### Obtaining and using the Facebook LLaMA 2 model
- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
@ -990,7 +868,7 @@ $mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/ho
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
```
Here's a demo of an interactive session running on Pixel 5 phone:
@ -1057,8 +935,8 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
```
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
@ -1108,7 +986,7 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
### Docs
- [main](./examples/main/README.md)
- [main (cli)](./examples/main/README.md)
- [server](./examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [BLIS](./docs/BLIS.md)

224
ci/run.sh
View File

@ -303,47 +303,47 @@ function gg_run_open_llama_7b_v2 {
wiki_test="${path_wiki}/wiki.test.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
@ -437,45 +437,45 @@ function gg_run_pythia_1_4b {
wiki_test_60="${path_wiki}/wiki.test-60.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
@ -569,47 +569,47 @@ function gg_run_pythia_2_8b {
wiki_test="${path_wiki}/wiki.test.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
@ -693,10 +693,10 @@ function gg_run_embd_bge_small {
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
(time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
set +e
}

View File

@ -9,7 +9,7 @@ set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )

View File

@ -1,14 +0,0 @@
comment: off
coverage:
status:
project:
default:
target: auto
threshold: 0
base: auto
patch:
default:
target: auto
threshold: 0
base: auto

View File

@ -84,4 +84,4 @@ endif ()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)

File diff suppressed because it is too large Load Diff

View File

@ -56,66 +56,66 @@ struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_threads = cpu_get_num_math();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 5; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
int32_t n_beams = 0; // if non-zero then use beam search of given width.
int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 5; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = 32.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
std::string rpc_servers = ""; // comma separated list of RPC servers
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
// // sampling parameters
struct llama_sampling_params sparams;
std::string model = ""; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model = ""; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string model_url = ""; // model url to download
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string prompt = "";
std::string prompt_file = ""; // store the external prompt file name
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files
std::string prompt_file = ""; // store the external prompt file name
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::string logdir = ""; // directory in which to save YAML log files
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
std::string logits_file = ""; // file for saving *all* logits
std::string logits_file = ""; // file for saving *all* logits
std::string rpc_servers = ""; // comma separated list of RPC servers
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
// TODO: avoid tuple, use struct
@ -124,37 +124,36 @@ struct gpt_params {
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
int32_t verbosity = 0;
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
//
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
//
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
bool kl_divergence = false; // compute KL divergence
bool kl_divergence = false; // compute KL divergence
bool random_prompt = false; // do not randomize prompt if none provided
bool usage = false; // print usage
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
bool special = false; // enable special token output
bool interactive = false; // interactive mode
bool interactive_first = false; // wait for user input immediately
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
bool embedding = false; // get only sentence embedding
bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool interactive_first = false; // wait for user input immediately
bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
@ -162,7 +161,6 @@ struct gpt_params {
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
bool instruct = false; // instruction mode (used for Alpaca models)
bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
@ -180,6 +178,68 @@ struct gpt_params {
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
// server params
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests
std::string hostname = "127.0.0.1";
std::string public_path = "";
std::string chat_template = "";
std::string system_prompt = "";
std::vector<std::string> api_keys;
std::string ssl_file_key = "";
std::string ssl_file_cert = "";
bool endpoint_slots = true;
bool endpoint_metrics = false;
bool log_json = false;
std::string slot_save_path;
float slot_prompt_similarity = 0.5f;
// batched-bench params
bool is_pp_shared = false;
std::vector<int32_t> n_pp;
std::vector<int32_t> n_tg;
std::vector<int32_t> n_pl;
// retrieval params
std::vector<std::string> context_files; // context files to embed
int32_t chunk_size = 64; // chunk size for context embedding
std::string chunk_separator = "\n"; // chunk separator for context embedding
// passkey params
int32_t n_junk = 250; // number of times to repeat the junk text
int32_t i_pos = -1; // position of the passkey in the junk text
// imatrix params
std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
int32_t i_chunk = 0; // start processing from this chunk
bool process_output = false; // collect data for the output tensor
bool compute_ppl = true; // whether to compute perplexity
// cvector-generator params
int n_completions = 64;
int n_pca_batch = 20;
int n_pca_iterations = 1000;
std::string cvector_outfile = "control_vector.gguf";
std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
};
void gpt_params_handle_model_default(gpt_params & params);
@ -199,7 +259,20 @@ std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
std::string string_random_prompt(std::mt19937 & rng);
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
std::vector<T> values;
std::istringstream str_stream(str);
std::string token;
while (std::getline(str_stream, token, delim)) {
T value;
std::istringstream token_stream(token);
token_stream >> value;
values.push_back(value);
}
return values;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
@ -212,6 +285,7 @@ bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);
std::string fs_get_cache_directory();
std::string fs_get_cache_file(const std::string & filename);
//
// Model utils
@ -282,6 +356,13 @@ std::string llama_detokenize_bpe(
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);
//
// Chat template utils
//
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool llama_chat_verify_template(const std::string & tmpl);
//
// KV cache utils
//

View File

@ -46,8 +46,12 @@ namespace grammar_parser {
state.rules[rule_id] = rule;
}
static bool is_digit_char(char c) {
return '0' <= c && c <= '9';
}
static bool is_word_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
}
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
@ -99,6 +103,17 @@ namespace grammar_parser {
return pos;
}
static const char * parse_int(const char * src) {
const char * pos = src;
while (is_digit_char(*pos)) {
pos++;
}
if (pos == src) {
throw std::runtime_error(std::string("expecting integer at ") + src);
}
return pos;
}
static std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') {
switch (src[1]) {
@ -137,6 +152,60 @@ namespace grammar_parser {
bool is_nested) {
size_t last_sym_start = out_elements.size();
const char * pos = src;
auto handle_repetitions = [&](int min_times, int max_times) {
if (last_sym_start == out_elements.size()) {
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
}
// apply transformation to previous symbol (last_sym_start to end) according to
// the following rewrite rules:
// S{m,n} --> S S S (m times) S'(n-m)
// S'(x) ::= S S'(x-1) |
// (... n-m definitions of these S' rules ...)
// S'(1) ::= S |
// S{m,} --> S S S (m times) S'
// S' ::= S S' |
// S* --> S{0,}
// --> S' ::= S S' |
// S+ --> S{1,}
// --> S S'
// S' ::= S S' |
// S? --> S{0,1}
// --> S'
// S' ::= S |
std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
if (min_times == 0) {
out_elements.resize(last_sym_start);
} else {
// Repeat the previous elements (min_times - 1) times
for (int i = 1; i < min_times; i++) {
out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
}
}
uint32_t last_rec_rule_id = 0;
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
std::vector<llama_grammar_element> rec_rule(previous_elements);
for (int i = 0; i < n_opt; i++) {
rec_rule.resize(previous_elements.size());
uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
if (i > 0 || max_times < 0) {
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
}
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, rec_rule_id, rec_rule);
last_rec_rule_id = rec_rule_id;
}
if (n_opt > 0) {
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
}
};
while (*pos) {
if (*pos == '"') { // literal string
pos++;
@ -197,40 +266,51 @@ namespace grammar_parser {
throw std::runtime_error(std::string("expecting ')' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
if (last_sym_start == out_elements.size()) {
throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
}
// apply transformation to previous symbol (last_sym_start to end) according to
// rewrite rules:
// S* --> S' ::= S S' |
// S+ --> S' ::= S S' | S
// S? --> S' ::= S |
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
std::vector<llama_grammar_element> sub_rule;
// add preceding symbol to generated rule
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
if (*pos == '*' || *pos == '+') {
// cause generated rule to recurse
sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
}
// mark start of alternate def
sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
if (*pos == '+') {
// add preceding symbol as alternate only for '+' (otherwise empty)
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
}
sub_rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, sub_rule_id, sub_rule);
// in original rule, replace previous symbol with reference to generated rule
out_elements.resize(last_sym_start);
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
} else if (*pos == '.') { // any char
last_sym_start = out_elements.size();
out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(0, -1);
} else if (*pos == '+') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(1, -1);
} else if (*pos == '?') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(0, 1);
} else if (*pos == '{') {
pos = parse_space(pos + 1, is_nested);
if (!is_digit_char(*pos)) {
throw std::runtime_error(std::string("expecting an int at ") + pos);
}
const char * int_end = parse_int(pos);
int min_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
int max_times = -1;
if (*pos == '}') {
max_times = min_times;
pos = parse_space(pos + 1, is_nested);
} else if (*pos == ',') {
pos = parse_space(pos + 1, is_nested);
if (is_digit_char(*pos)) {
const char * int_end = parse_int(pos);
max_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
}
if (*pos != '}') {
throw std::runtime_error(std::string("expecting '}' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else {
throw std::runtime_error(std::string("expecting ',' at ") + pos);
}
handle_repetitions(min_times, max_times);
} else {
break;
}
@ -325,6 +405,7 @@ namespace grammar_parser {
case LLAMA_GRETYPE_CHAR_NOT: return true;
case LLAMA_GRETYPE_CHAR_ALT: return true;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
case LLAMA_GRETYPE_CHAR_ANY: return true;
default: return false;
}
}
@ -339,6 +420,7 @@ namespace grammar_parser {
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
}
switch (elem.type) {
case LLAMA_GRETYPE_END:
@ -350,6 +432,7 @@ namespace grammar_parser {
case LLAMA_GRETYPE_CHAR_NOT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
case LLAMA_GRETYPE_CHAR_ALT:
case LLAMA_GRETYPE_CHAR_ANY:
fprintf(file, "(\"");
print_grammar_char(file, elem.value);
fprintf(file, "\") ");
@ -407,11 +490,15 @@ namespace grammar_parser {
}
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_ANY:
fprintf(file, ".");
break;
}
if (is_char_element(elem)) {
switch (rule[i + 1].type) {
case LLAMA_GRETYPE_CHAR_ALT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
case LLAMA_GRETYPE_CHAR_ANY:
break;
default:
fprintf(file, "] ");

View File

@ -16,92 +16,55 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa
static std::string repeat(const std::string & str, size_t n);
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
auto has_max = max_items != std::numeric_limits<int>::max();
if (min_items == 0 && max_items == 1) {
return item_rule + "?";
}
if (separator_rule.empty()) {
if (min_items == 0 && max_items == 1) {
return item_rule + "?";
} else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
if (min_items == 1 && !has_max) {
return item_rule + "+";
}
}
std::string result;
if (min_items > 0) {
if (item_rule_is_literal && separator_rule.empty()) {
result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
} else if (min_items == 0 && !has_max) {
return item_rule + "*";
} else {
std::vector<std::string> items(min_items, item_rule);
result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
}
}
std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
if (up_to_n == 0) {
return "";
} else if (up_to_n == 1) {
return "(" + content + ")?";
} else if (!separator_rule.empty() && !prefix_with_sep) {
return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
} else {
std::string res = repeat("(" + content + " ", up_to_n);
// strip trailing space
res = res.substr(0, res.length() - 1);
res += repeat(")?", up_to_n);
return res;
}
};
if (min_items > 0 && max_items != min_items) {
result += " ";
auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
if (min_items == 0) {
result = "(" + result + ")?";
}
if (max_items != std::numeric_limits<int>::max()) {
result += opt_repetitions(max_items - min_items, min_items > 0);
} else {
std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
if (min_items == 0 && !separator_rule.empty()) {
result = "(" + item_rule + " " + item_operator + "*)?";
} else {
result += item_operator + "*";
}
}
return result;
}
const std::string SPACE_RULE = "\" \"?";
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
struct BuiltinRule {
std::string content;
std::vector<std::string> deps;
};
const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
{"boolean", {"(\"true\" | \"false\") space", {}}},
{"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
{"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
{"decimal-part", {"[0-9]{1,16}", {}}},
{"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
{"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
{"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
{"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
{"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
{"null", {"\"null\" space", {}}},
};
std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
{"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
{"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
{"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
{"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
{"date-time", {"date \"T\" time", {"date", "time"}}},
{"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
{"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
@ -385,8 +348,7 @@ private:
sub_is_literal ? "\"" + sub + "\"" : sub,
min_times,
max_times,
"",
sub_is_literal
""
);
seq.back().second = false;
} else {

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
@ -82,6 +83,8 @@ models = [
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
]
@ -211,7 +214,7 @@ src_func = f"""
"""
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
convert_py = convert_py_pth.read_text()
convert_py = convert_py_pth.read_text(encoding="utf-8")
convert_py = re.sub(
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
lambda m: m.group(1) + src_func + m.group(3),
@ -219,7 +222,7 @@ convert_py = re.sub(
flags=re.DOTALL | re.MULTILINE,
)
convert_py_pth.write_text(convert_py)
convert_py_pth.write_text(convert_py, encoding="utf-8")
logger.info("+++ convert-hf-to-gguf.py was updated")

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations
@ -46,11 +47,12 @@ class Model:
_model_classes: dict[str, type[Model]] = {}
dir_model: Path
ftype: int
ftype: gguf.LlamaFileType
is_big_endian: bool
endianess: gguf.GGUFEndian
use_temp_file: bool
lazy: bool
model_name: str | None
part_names: list[str]
is_safetensors: bool
hparams: dict[str, Any]
@ -63,7 +65,7 @@ class Model:
# subclasses should define this!
model_arch: gguf.MODEL_ARCH
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
if type(self) is Model:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
self.dir_model = dir_model
@ -72,10 +74,11 @@ class Model:
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.use_temp_file = use_temp_file
self.lazy = not eager
self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
self.model_name = model_name
self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
self.is_safetensors = len(self.part_names) > 0
if not self.is_safetensors:
self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
self.hparams = Model.load_hparams(self.dir_model)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@ -93,7 +96,7 @@ class Model:
ftype_lw: str = ftype_up.lower()
# allow templating the file name with the output ftype, useful with the "auto" ftype
self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
@classmethod
def __init_subclass__(cls):
@ -181,7 +184,7 @@ class Model:
return new_name
def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_block_count(self.block_count)
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@ -323,21 +326,21 @@ class Model:
def write(self):
self.write_tensors()
self.gguf_writer.write_header_to_file()
self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.write_tensors_to_file(progress=True)
self.gguf_writer.close()
def write_vocab(self):
self.gguf_writer.write_header_to_file()
self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.close()
@staticmethod
def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
part_names: list[str] = []
for filename in os.listdir(dir_model):
if filename.endswith(suffix):
if filename.startswith(prefix) and filename.endswith(suffix):
part_names.append(filename)
part_names.sort()
@ -474,6 +477,12 @@ class Model:
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
res = "smaug-bpe"
if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
# ref: https://huggingface.co/LumiOpen/Poro-34B-chat
res = "poro-chat"
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
res = "jina-v2-code"
if res is None:
logger.warning("\n")
@ -661,7 +670,7 @@ class GPTNeoXModel(Model):
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
@ -794,7 +803,7 @@ class MPTModel(Model):
def set_gguf_parameters(self):
block_count = self.hparams["n_layers"]
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
self.gguf_writer.add_block_count(block_count)
@ -846,7 +855,7 @@ class OrionModel(Model):
raise ValueError("gguf: can not find ctx length parameter.")
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
@ -883,7 +892,7 @@ class BaichuanModel(Model):
else:
raise ValueError("gguf: can not find ctx length parameter.")
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
@ -958,7 +967,11 @@ class XverseModel(Model):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model)
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
# Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
# because vocab_size is the count of items, and indexes start at 0.
max_vocab_index = max(tokenizer.get_vocab().values())
if max_vocab_index >= vocab_size:
raise ValueError("Vocabulary size exceeds expected maximum size.")
reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
added_vocab = tokenizer.get_added_vocab()
@ -1006,7 +1019,7 @@ class XverseModel(Model):
else:
raise ValueError("gguf: can not find ctx length parameter.")
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
@ -1202,7 +1215,7 @@ class StableLMModel(Model):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
@ -1391,6 +1404,48 @@ class LlamaModel(Model):
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("BitnetForCausalLM")
class BitnetModel(Model):
model_arch = gguf.MODEL_ARCH.BITNET
def set_vocab(self):
self._set_vocab_sentencepiece()
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(1.0)
def weight_quant(self, weight):
dtype = weight.dtype
weight = weight.float()
s = 1 / weight.abs().mean().clamp(min=1e-5)
weight = (weight * s).round().clamp(-1, 1) / s
scale = weight.abs().max().unsqueeze(0)
weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
weight = torch.sign(weight).type(dtype)
return weight.type(dtype), scale.type(torch.float32)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
new_name = self.map_tensor_name(name)
if any(self.match_model_tensor_name(new_name, key, bid) for key in [
gguf.MODEL_TENSOR.ATTN_Q,
gguf.MODEL_TENSOR.ATTN_K,
gguf.MODEL_TENSOR.ATTN_V,
gguf.MODEL_TENSOR.ATTN_OUT,
gguf.MODEL_TENSOR.FFN_UP,
gguf.MODEL_TENSOR.FFN_DOWN,
gguf.MODEL_TENSOR.FFN_GATE,
]):
# transform weight into 1/0/-1 (in fp32)
weight_torch, scale_torch = self.weight_quant(data_torch)
yield (new_name, weight_torch)
yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
else:
yield (new_name, data_torch)
@Model.register("GrokForCausalLM")
class GrokModel(Model):
model_arch = gguf.MODEL_ARCH.GROK
@ -1623,6 +1678,12 @@ class Qwen2MoeModel(Model):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
_experts: list[dict[str, Tensor]] | None = None
@ -1677,7 +1738,7 @@ class GPT2Model(Model):
model_arch = gguf.MODEL_ARCH.GPT2
def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_block_count(self.hparams["n_layer"])
self.gguf_writer.add_context_length(self.hparams["n_ctx"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@ -2244,7 +2305,7 @@ class GemmaModel(Model):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
@ -2344,7 +2405,7 @@ class MambaModel(Model):
# Fail early for models which don't have a block expansion factor of 2
assert d_inner == 2 * d_model
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
self.gguf_writer.add_embedding_length(d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
@ -2451,11 +2512,13 @@ class JinaBertV2Model(BertModel):
def get_tensors(self):
for name, data in super().get_tensors():
if 'gated_layers' in name:
if 'gated_layer' in name:
d1 = data[:self.intermediate_size, :]
name1 = name.replace('gated_layers', 'gated_layers_w')
name1 = name1.replace('up_gated_layer', 'gated_layers_v')
d2 = data[self.intermediate_size:, :]
name2 = name.replace('gated_layers', 'gated_layers_v')
name2 = name2.replace('up_gated_layer', 'gated_layers_w')
yield name1, d1
yield name2, d2
continue
@ -2846,7 +2909,7 @@ def main() -> None:
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
logger.info("Set model parameters")
model_instance.set_gguf_parameters()

View File

@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
## GGUF specification

View File

@ -3,7 +3,7 @@
## Verifying that the model is running on the GPU with CUDA
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell
./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
```
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@ -27,7 +27,7 @@ RAM: 32GB
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
Result:

View File

@ -12,45 +12,46 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
else()
add_subdirectory(cvector-generator)
add_subdirectory(baby-llama)
add_subdirectory(batched)
add_subdirectory(batched-bench)
add_subdirectory(beam-search)
add_subdirectory(batched)
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(eval-callback)
add_subdirectory(export-lora)
add_subdirectory(finetune)
add_subdirectory(gritlm)
add_subdirectory(gbnf-validator)
add_subdirectory(gguf-split)
add_subdirectory(gguf)
add_subdirectory(gritlm)
add_subdirectory(imatrix)
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
add_subdirectory(minicpmv)
if (LLAMA_SYCL)
add_subdirectory(sycl)
endif()
add_subdirectory(main)
add_subdirectory(tokenize)
add_subdirectory(parallel)
add_subdirectory(perplexity)
add_subdirectory(quantize)
add_subdirectory(quantize-stats)
add_subdirectory(retrieval)
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(passkey)
add_subdirectory(speculative)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(gguf)
add_subdirectory(train-text-from-scratch)
add_subdirectory(imatrix)
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
endif()
add_subdirectory(export-lora)
add_subdirectory(main)
add_subdirectory(parallel)
add_subdirectory(passkey)
add_subdirectory(perplexity)
add_subdirectory(quantize-stats)
add_subdirectory(quantize)
add_subdirectory(retrieval)
if (LLAMA_RPC)
add_subdirectory(rpc)
endif()
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
endif()
if (LLAMA_SYCL)
add_subdirectory(sycl)
endif()
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(tokenize)
add_subdirectory(train-text-from-scratch)
endif()

View File

@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi
./main "${GEN_OPTIONS[@]}" \
./llama-cli "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--in-prefix " " \
--in-suffix "${AI_NAME}:" \

View File

@ -1,19 +0,0 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \
--temp 0.2 \
--repeat_penalty 1.1 \
-t 7

View File

@ -1,4 +1,4 @@
set(TARGET baby-llama)
set(TARGET llama-baby-llama)
add_executable(${TARGET} baby-llama.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View File

@ -522,8 +522,8 @@ static struct ggml_tensor * forward(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, 1]
// Kcur shape [n_embd/n_head, n_head, N, 1]
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
// store key and value to memory
{
@ -759,8 +759,8 @@ static struct ggml_tensor * forward_batch(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, n_batch]
// Kcur shape [n_embd/n_head, n_head, N, n_batch]
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
@ -1056,7 +1056,7 @@ static struct ggml_tensor * forward_lora(
model->layers[il].wqb,
cur)),
n_embd/n_head, n_head, N),
KQ_pos, n_rot, 0, 0);
KQ_pos, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0,
ggml_reshape_3d(ctx0,
ggml_mul_mat(ctx0,
@ -1065,7 +1065,7 @@ static struct ggml_tensor * forward_lora(
model->layers[il].wkb,
cur)),
n_embd/n_head, n_head, N),
KQ_pos, n_rot, 0, 0);
KQ_pos, n_rot, 0);
// store key and value to memory
{

View File

@ -58,4 +58,4 @@ echo "$2
model=$1
# generate the most likely continuation until the string "===" is found
./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs

View File

@ -1,4 +1,4 @@
set(TARGET batched-bench)
set(TARGET llama-batched-bench)
add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View File

@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
```bash
./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
# custom set of batches
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
```
## Sample results

View File

@ -28,67 +28,27 @@ static std::vector<int> parse_list(char * p) {
return ret;
}
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
LOG_TEE("\n");
}
int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
return 1 ;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
int n_kv_max = 2048;
int n_batch = 2048;
int n_ubatch = 512;
bool flash_attn = false;
int is_pp_shared = 0;
int n_gpu_layers = 0;
int is_pp_shared = params.is_pp_shared;
std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
std::vector<int> n_tg = { 128, 256, };
std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
//std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
if (argc >= 2) {
params.model = argv[1];
}
if (argc >= 3) {
n_kv_max = std::atoi(argv[2]);
}
if (argc >= 4) {
n_batch = std::atoi(argv[3]);
}
if (argc >= 5) {
n_ubatch = std::atoi(argv[4]);
}
if (argc >= 6) {
flash_attn = std::atoi(argv[5]);
}
if (argc >= 7) {
is_pp_shared = std::atoi(argv[6]);
}
if (argc >= 8) {
n_gpu_layers = std::atoi(argv[7]);
}
if (argc >= 9) {
n_pp = parse_list(argv[8]);
}
if (argc >= 10) {
n_tg = parse_list(argv[9]);
}
if (argc >= 11) {
n_pl = parse_list(argv[10]);
}
std::vector<int> n_pp = params.n_pp;
std::vector<int> n_tg = params.n_tg;
std::vector<int> n_pl = params.n_pl;
// init LLM
@ -97,12 +57,7 @@ int main(int argc, char ** argv) {
// initialize the model
llama_model_params model_params = llama_model_default_params();
const std::vector<float> t_split(llama_max_devices(), 0.0f);
model_params.n_gpu_layers = n_gpu_layers;
model_params.tensor_split = t_split.data();
llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@ -111,16 +66,7 @@ int main(int argc, char ** argv) {
return 1;
}
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = n_batch;
ctx_params.n_ubatch = n_ubatch;
ctx_params.flash_attn = flash_attn;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
// ensure enough sequences are available
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
@ -132,6 +78,8 @@ int main(int argc, char ** argv) {
return 1;
}
const int32_t n_kv_max = llama_n_ctx(ctx);
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
// decode in batches of ctx_params.n_batch tokens
@ -175,7 +123,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");

View File

@ -1,6 +1,6 @@
.PHONY: build
build:
xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
rm -f ./batched_swift
ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
rm -f ./llama-batched-swift
ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift

View File

@ -4,7 +4,7 @@
import PackageDescription
let package = Package(
name: "batched_swift",
name: "llama-batched-swift",
platforms: [.macOS(.v12)],
dependencies: [
.package(name: "llama", path: "../../"),
@ -13,7 +13,7 @@ let package = Package(
// Targets are the basic building blocks of a package, defining a module or a test suite.
// Targets can depend on other targets in this package and products from dependencies.
.executableTarget(
name: "batched_swift",
name: "llama-batched-swift",
dependencies: ["llama"],
path: "Sources",
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]

View File

@ -1,4 +1,4 @@
This is a swift clone of `examples/batched`.
$ `make`
$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]`

View File

@ -1,4 +1,4 @@
set(TARGET batched)
set(TARGET llama-batched)
add_executable(${TARGET} batched.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View File

@ -3,7 +3,7 @@
The example demonstrates batched generation from a given prompt
```bash
./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
...

View File

@ -7,48 +7,31 @@
#include <string>
#include <vector>
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
LOG_TEE("\n");
}
int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
return 1 ;
params.prompt = "Hello my name is";
params.n_predict = 32;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
// number of parallel batches
int n_parallel = 1;
int n_parallel = params.n_parallel;
// total length of the sequences including the prompt
int n_len = 32;
// number of layers to offload to the GPU
int n_gpu_layers = 0;
if (argc >= 2) {
params.model = argv[1];
}
if (argc >= 3) {
params.prompt = argv[2];
}
if (argc >= 4) {
n_parallel = std::atoi(argv[3]);
}
if (argc >= 5) {
n_len = std::atoi(argv[4]);
}
if (argc >= 6) {
n_gpu_layers = std::atoi(argv[5]);
}
if (params.prompt.empty()) {
params.prompt = "Hello my name is";
}
string_process_escapes(params.prompt);
int n_predict = 32;
// init LLM
@ -57,9 +40,7 @@ int main(int argc, char ** argv) {
// initialize the model
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = n_gpu_layers;
llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@ -73,18 +54,14 @@ int main(int argc, char ** argv) {
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(model, params.prompt, true);
const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
// initialize the context
llama_context_params ctx_params = llama_context_default_params();
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel);
ctx_params.n_seq_max = n_parallel;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
ctx_params.n_batch = std::max(n_predict, n_parallel);
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
@ -93,9 +70,9 @@ int main(int argc, char ** argv) {
return 1;
}
const int n_ctx = llama_n_ctx(ctx);
const int n_ctx = llama_n_ctx(ctx);
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
@ -156,7 +133,7 @@ int main(int argc, char ** argv) {
const auto t_main_start = ggml_time_us();
while (n_cur <= n_len) {
while (n_cur <= n_predict) {
// prepare the next batch
llama_batch_clear(batch);
@ -192,7 +169,7 @@ int main(int argc, char ** argv) {
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1;
LOG_TEE("\n");
if (n_parallel > 1) {

View File

@ -1,188 +0,0 @@
#include "common.h"
#include "llama.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
// Used for debugging to print out beam tokens.
struct ostream_beam_view {
llama_context * ctx;
llama_beam_view beam_view;
};
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
}
return os << ')';
}
// Put here anything you want back in beam_search_callback().
struct beam_search_callback_data {
llama_context * ctx;
std::vector<llama_token> response;
};
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
}
// Function matching type llama_beam_search_callback_fn_t.
// Custom callback example is called each time the beams lengths increase:
// * Show progress by printing ',' following by number of convergent beam tokens if any.
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
llama_beam_view& beam_view = beams_state.beam_views[i];
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
beam_view.eob = true;
}
}
printf(","); // Show progress
if (const size_t n = beams_state.common_prefix_length) {
callback_data.response.resize(callback_data.response.size() + n);
assert(0u < beams_state.n_beams);
const llama_token * tokens = beams_state.beam_views[0].tokens;
std::copy(tokens, tokens + n, callback_data.response.end() - n);
printf("%zu", n);
}
fflush(stdout);
#if 1 // DEBUG: print current beams for this iteration
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
}
#endif
}
int main(int argc, char ** argv)
{
gpt_params params;
//params.n_gpu_layers = 200;
//---------------------------------
// Print help :
//---------------------------------
if ( argc < 2 || argv[1][0] == '-' )
{
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
return 1 ;
}
//---------------------------------
// Load parameters :
//---------------------------------
params.model = argv[1];
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
if ( argc > 3 )
{
params.prompt = argv[3];
}
if ( params.prompt.empty() )
{
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
}
//---------------------------------
// Init LLM :
//---------------------------------
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params( params );
if ( model == NULL )
{
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
return 1;
}
//---------------------------------
// Tokenize the prompt :
//---------------------------------
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;
if (tokens_list.size() > max_tokens_list_size)
{
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
__func__ , tokens_list.size() , max_tokens_list_size );
return 1;
}
fprintf( stderr, "\n\n" );
// Print the tokens from the prompt :
for( auto id : tokens_list )
{
std::cout << llama_token_to_piece(ctx, id);
}
std::cout << std::flush;
int n_past = 0;
if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
{
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
return 1;
}
n_past += tokens_list.size();
beam_search_callback_data callback_data{ctx, {}};
size_t const beam_width = static_cast<size_t>(params.n_beams);
int const n_predict = 256;
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
std::cout << "\n\n";
for (llama_token const token_id : callback_data.response) {
std::cout << llama_token_to_piece(ctx,token_id);
}
std::cout << std::endl;
llama_free( ctx );
llama_free_model( model );
llama_backend_free();
return 0;
}

View File

@ -1,4 +1,4 @@
set(TARGET benchmark)
set(TARGET llama-bench-matmult)
add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})

View File

@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./main $GEN_OPTIONS \
./llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \

View File

@ -62,7 +62,7 @@ fi
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
echo 'Prompt cache does not exist, building...'
# Default batch_size to 64 here for better user feedback during initial prompt processing
./main 2>>"$LOG" \
./llama-cli 2>>"$LOG" \
--batch_size 64 \
"${OPTS[@]}" \
--prompt-cache "$PROMPT_CACHE_FILE" \
@ -109,13 +109,13 @@ while read -e line; do
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
./main 2>>"$LOG" "${OPTS[@]}" \
./llama-cli 2>>"$LOG" "${OPTS[@]}" \
--prompt-cache "$CUR_PROMPT_CACHE" \
--prompt-cache-all \
--file "$CUR_PROMPT_FILE" \
--reverse-prompt "${USER_NAME}:" \
--n_predict "$n_predict" |
skip_bytes 1 | # skip BOS token added by ./main
skip_bytes 1 | # skip BOS token added by ./llama-cli
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
skip_bytes "$n_prompt_len_pre" # print generation
@ -133,7 +133,7 @@ while read -e line; do
# TODO get both messages in one go
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
echo >&2 "Couldn't get number of tokens from ./main output!"
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
exit 1
fi
@ -144,7 +144,7 @@ while read -e line; do
fi
# Update cache for next prompt in background, ideally during user input
./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
--prompt-cache "$NEXT_PROMPT_CACHE" \
--file "$NEXT_PROMPT_FILE" \
--n_predict 1 &

View File

@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./bin/main $GEN_OPTIONS \
./bin/llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \

View File

@ -11,6 +11,6 @@ cd ..
#
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
#
./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt

View File

@ -176,7 +176,7 @@ class Params:
rope_scaling_type: gguf.RopeScalingType | None = None
f_rope_freq_base: float | None = None
f_rope_scale: float | None = None
n_orig_ctx: int | None = None
n_ctx_orig: int | None = None
rope_finetuned: bool | None = None
ftype: GGMLFileType | None = None
@ -226,7 +226,7 @@ class Params:
with open(config_path) as f:
config = json.load(f)
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
rope_scaling_type = f_rope_scale = n_ctx_orig = rope_finetuned = None
rope_scaling = config.get("rope_scaling")
if rope_scaling is not None and (typ := rope_scaling.get("type")):
@ -236,7 +236,7 @@ class Params:
rope_scaling_type = gguf.RopeScalingType.LINEAR
elif typ == "yarn":
rope_scaling_type = gguf.RopeScalingType.YARN
n_orig_ctx = rope_scaling['original_max_position_embeddings']
n_ctx_orig = rope_scaling['original_max_position_embeddings']
rope_finetuned = rope_scaling['finetuned']
else:
raise NotImplementedError(f'Unknown rope scaling type: {typ}')
@ -272,7 +272,7 @@ class Params:
f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type = rope_scaling_type,
f_rope_scale = f_rope_scale,
n_orig_ctx = n_orig_ctx,
n_ctx_orig = n_ctx_orig,
rope_finetuned = rope_finetuned,
)
@ -864,8 +864,8 @@ class OutputFile:
self.gguf.add_rope_scaling_type(params.rope_scaling_type)
self.gguf.add_rope_scaling_factor(params.f_rope_scale)
if params.n_orig_ctx is not None:
self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
if params.n_ctx_orig is not None:
self.gguf.add_rope_scaling_orig_ctx_len(params.n_ctx_orig)
if params.rope_finetuned is not None:
self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)

View File

@ -1,4 +1,4 @@
set(TARGET convert-llama2c-to-ggml)
set(TARGET llama-convert-llama2c-to-ggml)
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View File

@ -8,7 +8,7 @@ To convert the model first download the models from the [llama2.c](https://githu
After successful compilation, following usage options are available:
```
usage: ./convert-llama2c-to-ggml [options]
usage: ./llama-convert-llama2c-to-ggml [options]
options:
-h, --help show this help message and exit
@ -19,10 +19,10 @@ options:
An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
Now you can use the model with a command like:
`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

View File

@ -1,5 +1,5 @@
set(TARGET beam-search)
add_executable(${TARGET} beam-search.cpp)
set(TARGET llama-cvector-generator)
add_executable(${TARGET} cvector-generator.cpp pca.hpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@ -0,0 +1,34 @@
# cvector-generator
This example demonstrates how to generate a control vector using gguf models.
Related PRs:
- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)
## Examples
```sh
# CPU only
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
# With GPU
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
# With advanced options
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100
# To see help message
./cvector-generator -h
# Then, have a look at "cvector" section
```
## Tips and tricks
If you have multiple lines per prompt, you can escape the newline character (change it to `\n`). For example:
```
<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
<|im_start|>system\nYou are in a very good mood today<|im_end|>
```

View File

@ -0,0 +1,582 @@
That game
I can see
Hmm, this
I can relate to
Who is
I understand the
Ugh,
What the hell was
Hey, did anyone
Although
Thank you for choosing
What are you
Oh w
How dare you open
It was my pleasure
I'm hon
I appreciate that you
Are you k
Whoever left this
It's always
Ew,
Hey, I l
Hello? Is someone
I understand that
That poem
Aww, poor
Hey, it
Alright, who
I didn't
Well, life
The document
Oh no, this
I'm concerned
Hello, this is
This art
Hmm, this drink
Hi there!
It seems
Is
Good
I can't
Ex
Who are
I can see that
Wow,
Today is a
Hey friend
Sometimes friends
Oh, this old
The weather outside
This place is sur
I appreciate your input
Thank you for the
Look at
I'm disappoint
To my
How dare you
That's an
This piece of art
Eww
This park is
This is incredible
Oh no, someone
Exc
Well, it'
I warned
Hey, I understand
Hey, I saw
How dare you go
What the he
Hey
It's
Hello? Hello?
It
Oh no!
This is the perfect
Good morning,
Oh no, there
It's so
Yeah
Uh,
Hello everyone
Who turned off
The weather
Who'
Hey, this
Wait,
Eww, gross
Excuse
It seems like you
Thank you so
What happened?
Oh my g
I am deeply sad
I war
Okay, let'
Hey, that
That was a beautiful
Oh no! That
What happened
Hey there
The artist'
What?!
Hey, it'
I am disappoint
It seems like
Oh no! The
This park is a
If you
Yes! I did
It sounds
What
Who is it
Hmm, that
That's strange
Yeah, that was
That's interesting
This park
What the hell
Who is that
I feel like my
Oh well
What the hell is
Hello? Hello
To my dearest
Bless you!\"
Thank you for
Oh, looks like
Can you please
This place is
Eww, what
Bless you
Is everything
Hey, I just
Whoever left these
Well, that'
I feel
Hey, do you
It's sad
Oh no, it
Hey, that'
Oh my god,
Thank you,
Hello little one,
I apolog
Hey team, I
How dare you read
Who is this and
Whoever left
Hi there! W
A
If you have
I was
U
Bless
Well, this
Oh, I'
It's a
Eww,
Is everything okay?
Oh, I
Hello, can you
Al
That was a great
What are
I understand that not
Oh no, not
Who is it?\"
Hey, can we
Whoever is taking
I would love to
Hey, I noticed
Hey, could
I understand that there
Hello?
D
Oh man, I
Thank you so much
Oh no, my
Dear [Name
Uh
I remember
Hey, who
Well, it
Are you
I understand that it
Hey, is
I would
Who is this
Excuse me
Alright
I am thrilled
Sometimes friends have
Who the
It's interesting
I would love
E
Hello? Is anyone
Well, this is
This place
Well,
I warned you
Hey, watch where
Oh my
That'
Sometimes friends have different
I understand that everyone
What?
What do these notes
I can relate
I'm not
I understand
To my dear
Guys
Well
Hey, I appreciate
Wow, what
Dear
That melody
Who the hell
Today is
Hello little
Wow, look
That's great
Love is never wrong
I'm having
Whoa, did
Ugh
Can you please provide
I miss you,
I feel uncom
I know
Ugh, this
Hey, watch
Oh great, a
I didn
Okay
That game of char
Oh
I appreciate
Who's there
I am so
Oh great, someone
Hey, could you
I remember wondering
Wait, what?
What do
Hello? Can
Hey there,
That game of
This is incred
Oh my gosh
Oh great, f
I appreciate your
It sounds like
What the heck
Okay, I understand
Ew
I understand that this
Uh, hi
Hi everyone!
What the hell?
Thank you for your
Oh no, the
Wow, I
Who turned
Dear [
Whoever
This is a
Whoa, he
What in the world
Although the physical
Hello, who is
That's amaz
Hey, I know
Okay, that
Hi everyone
Hey, is everything
I understand your fr
Oh no, poor
Oh, look
Good morning
Ew, gross
Oh no, did
Look at the family
Hey team
Yes!
Hey, can I
Okay, that'
It's great
Love is
Hey, what
Good morning, world
Who is it?
That poem really reson
I
That's
I understand the task
Gu
Hello? Who'
This postcard is
Whoa,
Oh, that
I understand that I
Whoever is
Hello? Who is
I'm really
Wow, this
Can
This artwork really
This is a shame
I miss you too
Who are you?
Today is a difficult
Hey, just
Are you okay
I am
Hi,
Wow, that
Hey there! Can
Okay, stay
Oh great, just
Yeah,
Hello? Can you
Oh, looks
Thank you for sharing
I'm glad
Hey, is that
Hmm
It was my
It sounds like you
Wow, your
I was promised certain
That was such a
Thank
Excuse you
That was
Hey team,
I feel un
It was
What'
Hey friend, I
How
Saying goodbye
That
It's heart
How dare
Oh,
Hello, may
What's this
Thank you for recogn
Aww, that
Oh, I remember
Hmm, that'
I miss
I know this
Wait
Is everything okay
Who is that person
Wow, you
Oh great
I'm sad
Wow, the
I am very disappoint
Who turned off the
I understand that things
I'm very
Hi
That's very
Okay, I
Oh no,
Wow, there
What's wrong
I apologize for
Hey, I
Can I help you
Oh, I didn
Alright,
Oh wow,
Oh my goodness
I know this event
What in the
Saying
Yeah, that
Guys, I
Hey, this v
This post
Are
Hey, can
Hello? Is
I can only imagine
Oh, that sounds
Hey, is anyone
I am disappointed
Hello,
Hey everyone, I
That was such
It's okay
The artist
Whoa
I understand that mistakes
Can I help
Who
Hi everyone! I
Hey, can you
Wow, how
Today
Oh no, I
Oh well, I
Well, that
This is the
Yes! I finally
Hey there little
Hello everyone!
Love is never
Look at the
This postcard
Oh great,
Can I
Hmm, this is
I understand your
Oh, look at
B
I'm so
Whoa, this
W
Oh, this
Sometimes
This piece of
What the
That was a
Hey, do
Oh no
Whoa, what
I feel like I
The documentary
Hello
Hello little one
I understand that my
Eww, that
Wow, an
Yes! Finally,
Although the physical location
Whoever is watching
That movie
I remember wondering about
Hey there, little
Who's
Hello, who
Hello everyone! Thank
Hello, can
That's too
Hey, just wanted
Hey there, I
Saying good
Hey there!
Who is there?
Oh my good
I am very
Oh no, what
Wow, thank
I was promised
Hi, is
Hey, I'
Guys, the
Oh no, that
Who is there
Hello, this
That movie really touched
If you have something
The documentary was
I'm starting
Are you kidd
That movie really
Hey everyone,
Thank you for considering
I didn'
Yes! I
Can you
Oh my god
Hey, whoever
That melody really
Thank you, little
Hello, may I
Look
Wow, we
It looks
What do these
Oh wow
I apologize
What are you all
It's such
It's clear
Hey, I was
Hey friend,
I can only
The weather outside is
Eww, this
I miss you
Wow
Aww,
Hi, is there
This artwork
Okay,
Oh well,
This
I'
Say
Hey there little gu
Hmm,
Whoa, who
I am thr
Oh man
Okay, stay calm
I'm happy
Oh, this cur
Oh man,
I'm sorry
Hello? Who
What?! That
This piece
Hey everyone
That's so
Are you okay?
What happened? Where
Hi there
The
Who the hell entered
I can
Guys,
What's
What in
It's important
I'm
I'm coming
It'
Yes! Finally
Wait, what
Wow, reading
I'm surprised
Hey, did
Hey,
Okay, let
I understand that you
Who the hell threw
Eww, who
Thank you for thinking
Who is this?\"
I am deeply
Thank you for including
Oh no, an
It looks like you
Aww
I'm confused
Wow, it
That poem really
Yes
Hey there, is
Hey, what'
Thank you for remember
To
This is
Thank you for making
I can'
That mel
Wow, they
I feel like
Although the
Who are you
Love
If
What the hell are
I am so sad
Oh, I found
Thank you
It looks like
Well, life is
I appreciate that
The artist's
Whoa, that
It's never

View File

@ -0,0 +1,499 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include "pca.hpp"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include <cstdio>
#include <string>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <climits>
//////////////////////////////////////////////////
// utils
template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
std::string ret;
for (; begin != end; ++begin) {
ret += llama_token_to_piece(ctx, *begin);
}
return ret;
}
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
printf("\nexample usage:\n");
printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100\n", argv[0]);
printf("\n");
}
//////////////////////////////////////////////////
// cb_eval is reused for each pair of positive - negative prompt
struct callback_data {
ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered
int n_layers = 0;
int n_tokens = 0;
bool is_eval_pos = true;
// each element of the vector correspond to one layer
std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows maybe different for each layer
// save a tensor into either v_pos or v_neg (decided by is_eval_pos)
void save_tensor_for_layer(struct ggml_tensor * t) {
GGML_ASSERT(t->type == GGML_TYPE_F32);
if (ctx_ggml == nullptr) {
// alloc a new ctx_ggml if needed
struct ggml_init_params params_ggml = {
/*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx_ggml = ggml_init(params_ggml);
}
// copy tensor data
auto n_bytes = ggml_nbytes(t);
struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
ggml_set_name(t_layer, ggml_get_name(t));
//print_debug_tensor(t_layer);
if (is_eval_pos) {
v_pos.push_back(t_layer);
} else {
v_neg.push_back(t_layer);
}
}
// calculate diff (v_pos - v_neg) and place the result back to v_pos
// all zero rows in the diff tensor will also be removed
// NOTE: final layer is ignored. we only have (n_layers - 1) to process
std::vector<struct ggml_tensor *> calc_diff() {
for (float il = 0; il < v_pos.size(); il++) {
float * a = (float *) v_pos[il]->data;
float * b = (float *) v_neg[il]->data;
size_t n_elem = ggml_nelements(v_pos[il]);
for (size_t j = 0; j < n_elem; j++) {
a[j] -= b[j];
}
//print_debug_tensor(v_pos[i]);
auto diff_filtered = filter_nonzero_rows(v_pos[il]);
v_diff_filtered.push_back(diff_filtered);
}
return v_diff_filtered; // for convinient, we return the result std::vector
}
// delete zero rows from a given 2D tensor
struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
//printf("filter_nonzero_rows\n");
auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
// check if given row containing all zero elements
int n_cols = t->ne[0]; // hint: should be equal to n_embd
for (int col = 0; col < n_cols; ++col) {
if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
return false;
}
}
return true;
};
std::vector<int> rows_to_copy; // the idx of non-zero cols (to be copied to row of diff_filtered)
for (int i_row = 0; i_row < a->ne[1]; i_row++) {
if (!is_row_all_zeros(a, i_row, 1e-6)) {
rows_to_copy.push_back(i_row);
}
}
// get "n_nonzero_rows" for the output "diff_filtered"
int n_nonzero_rows = rows_to_copy.size();
//printf("n_nonzero_rows: %d\n", n_nonzero_rows);
int n_embd = a->ne[0];
GGML_ASSERT(n_nonzero_rows > 0);
// diff_filtered: [n_embd, n_nonzero_rows]
struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
// copy non-zero rows
for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
int src_row = rows_to_copy[dest_row];
for (int i = 0; i < n_embd; i++) {
float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
}
}
//print_debug_tensor(diff_filtered);
return diff_filtered;
}
// we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors
void reset() {
for (auto ptr : v_pos) free(ptr->data);
for (auto ptr : v_neg) free(ptr->data);
for (auto ptr : v_diff_filtered) free(ptr->data);
v_pos.clear();
v_neg.clear();
v_diff_filtered.clear();
if (ctx_ggml) {
ggml_free(ctx_ggml);
}
ctx_ggml = nullptr;
}
};
/**
* process_ctx is used to store the ggml context for pre-post processing the diff vectors
* in short, input => v_diff and output => v_final
*/
struct train_context {
ggml_context * ctx_ggml;
int n_embd;
int n_layers;
/* pair of prompts to be used for generating final vector */
std::vector<std::string> positive_entries;
std::vector<std::string> negative_entries;
// each element of the vector correspond to one layer
// NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here
// NOTE (2): v_diff is transposed from v_diff_tmp
std::vector<struct ggml_tensor *> v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
// to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor
// v_diff_tmp will get converted unto v_diff later on
std::vector<std::vector<uint8_t>> v_diff_tmp;
train_context(int n_embd_, int n_layers_) {
n_embd = n_embd_;
n_layers = n_layers_;
struct ggml_init_params params_ggml = {
/*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx_ggml = ggml_init(params_ggml);
for (int il = 0; il < n_layers - 1; il++) {
std::vector<uint8_t> empty;
v_diff_tmp.push_back(empty);
auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
v_final.push_back(t);
}
}
// add new rows into existing tensor in v_diff_tmp
void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
for (int il = 0; il < n_layers - 1; il++) {
auto t = diff_filtered[il];
auto & diff_tmp = v_diff_tmp[il];
size_t curr_size = diff_tmp.size();
diff_tmp.resize(curr_size + ggml_nbytes(t));
memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
}
}
// build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
// TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
void build_v_diff() {
printf("build_v_diff\n");
for (int il = 0; il < n_layers - 1; il++) {
auto & diff_tmp = v_diff_tmp[il];
int n_elem = diff_tmp.size() / sizeof(float);
GGML_ASSERT(n_elem % n_embd == 0);
int n_rows = n_elem / n_embd;
struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
// copy data & transpose
diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
float * arr = (float *) diff_tmp.data();
for (int ir = 0; ir < n_rows; ++ir) {
for (int ic = 0; ic < n_embd; ++ic) {
float f = arr[ir*n_embd + ic];
ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
}
}
v_diff.push_back(diff);
print_debug_tensor(diff);
// free memory of diff_tmp
diff_tmp.resize(0);
}
}
~train_context() {
for (auto ptr : v_final) free(ptr->data);
for (auto ptr : v_diff) free(ptr->data);
// no need to free v_diff_tmp, since we didn't use malloc
ggml_free(ctx_ggml);
}
};
struct tokenized_prompt {
std::vector<llama_token> tokens_pos;
std::vector<llama_token> tokens_neg;
size_t max_seq_len;
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
padding_seq(ctx, tokens_pos, max_seq_len);
padding_seq(ctx, tokens_neg, max_seq_len);
}
void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
// TODO: customize padding token
std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
llama_token pad_tok = pad_tokens.back();
while (tokens.size() < len) {
tokens.push_back(pad_tok);
}
}
};
//////////////////////////////////////////////////
template <typename T>
static std::string to_string(const T & val) {
std::stringstream ss;
ss << val;
return ss.str();
}
static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
std::vector<std::string> output;
std::ifstream file(path);
if (!file.is_open()) {
fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
exit(1);
}
std::string line;
while (std::getline(file, line)) {
bool is_skip = skip_empty_lines && line.empty();
if (!is_skip) {
string_process_escapes(line);
output.push_back(line);
}
}
file.close();
return output;
}
//////////////////////////////////////////////////
static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (callback_data *) user_data;
static const char * l_out_name = "l_out";
const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
if (ask) {
return is_l_out;
}
if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
return true;
}
// save the tensor to current context
cb_data->save_tensor_for_layer(t);
return true;
}
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
llama_kv_cache_clear(ctx);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
return true;
}
static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
struct gguf_context * ctx = gguf_init_empty();
const std::string arch = "controlvector";
gguf_set_val_str(ctx, "general.architecture", arch.c_str());
gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());
for (size_t i = 0; i < v_ctrl.size(); ++i) {
gguf_add_tensor(ctx, v_ctrl[i]);
print_debug_tensor(v_ctrl[i]);
printf("Added tensor: %s\n", v_ctrl[i]->name);
}
printf("%s: writing file...\n", __func__);
gguf_write_to_file(ctx, fname.c_str(), false);
printf("%s: wrote file '%s'\n", __func__, fname.c_str());
gguf_free(ctx);
}
/**
* Load prompt files and completion file.
* Then format each pair of prompt + completion to make an entry.
*/
static int prepare_entries(gpt_params & params, train_context & ctx_train) {
// load prompts
std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
if (positive_prompts.size() != negative_prompts.size()) {
fprintf(stderr, "number of positive and negative prompts must be equal\n");
return 1;
}
if (positive_prompts.empty()) {
fprintf(stderr, "must provide at least one prompt pair\n");
return 1;
}
// create templated prompts
std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
auto format_template = [](std::string persona, std::string suffix) {
// entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST] "
return persona + suffix;
};
for (size_t i = 0; i < positive_prompts.size(); ++i) {
for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
// TODO replicate the truncations done by the python implementation
ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
}
}
return 0;
}
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
if (params.n_pca_iterations % params.n_pca_batch != 0) {
fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n");
return 1;
}
callback_data cb_data;
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = cb_eval;
params.cb_eval_user_data = &cb_data;
params.warmup = false;
print_build_info();
llama_backend_init();
llama_numa_init(params.numa);
// load the model to get hparams
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
// int n_ctx = llama_n_ctx(ctx);
int n_layers = llama_n_layer(model);
int n_embd = llama_n_embd(model);
// get model hint param (a.k.a model arch name)
char model_hint[128];
llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
// init train_context
train_context ctx_train(n_embd, n_layers);
// load and prepare entries for training
prepare_entries(params, ctx_train);
// we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped
std::vector<tokenized_prompt> tokenized_prompts;
size_t n_total_tokens = 0;
for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
n_total_tokens += 2 * t.max_seq_len;
tokenized_prompts.push_back(std::move(t));
}
std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
bool success = false;
tokenized_prompt t = tokenized_prompts[i];
cb_data.n_layers = n_layers;
cb_data.n_tokens = t.max_seq_len;
printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
(int) i+1, (int) ctx_train.positive_entries.size(),
tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
(int) t.max_seq_len);
cb_data.is_eval_pos = true;
success = get_hidden_layers(ctx, t.tokens_pos);
if (!success) break;
cb_data.is_eval_pos = false;
success = get_hidden_layers(ctx, t.tokens_neg);
if (!success) break;
// calculate diff and remove all zero rows
auto v_diff_filtered = cb_data.calc_diff();
// save & concat the filtered v_diff to ctx_train
ctx_train.concat_diff_tmp(v_diff_filtered);
// reset for next iteration
cb_data.reset();
}
// done with the model, we can now free it to make gain some memory
printf("Done evaluate prompts, unload model...\n");
llama_free(ctx);
llama_free_model(model);
// prepare ctx_train for PCA
ctx_train.build_v_diff();
// run PCA
PCA::pca_params pca_params;
pca_params.n_threads = params.n_threads;
pca_params.n_batch = params.n_pca_batch;
pca_params.n_iterations = params.n_pca_iterations;
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
// write output vectors to gguf
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
llama_backend_free();
return 0;
}

View File

@ -0,0 +1 @@
[INST] Act like a person who is extremely sad. [/INST]

View File

@ -0,0 +1,322 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include <cstdio>
#include <ctime>
#include <string>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#define DEBUG_POS 5
static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
if (!with_data) return;
printf("%s: %s[0] = [", __func__, t->name);
for (size_t i = 0; i <= DEBUG_POS; i++) {
printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
}
printf(" ... ]\n");
}
namespace PCA {
// input params for PCA computations
struct pca_params {
int n_threads = 1;
int n_batch = 20; // number of iterations do to in one batch. larger the batch, more memory is used
int n_iterations = 1000;
float tolerance = 1e-7;
// for debugging
int i_layer = 0;
int n_layers = 0;
};
// result from each iteration
struct pca_result {
struct ggml_tensor * calculated_square = NULL;
std::vector<struct ggml_tensor *> eigenvectors;
std::vector<float> distances;
};
struct pca_model {
ggml_backend_t backend = NULL;
ggml_backend_buffer_t buffer;
struct ggml_context * ctx; // context to compute graph on target device
struct ggml_context * ctx_host; // host context to store results
// tensors on target device
struct ggml_tensor * dev_input;
struct ggml_tensor * dev_square;
struct ggml_tensor * dev_eigenvector;
pca_model(struct ggml_tensor * t_input) {
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#endif
// TODO: enable Metal support when support for GGML_OP_SQRT is added
// #ifdef GGML_USE_METAL
// fprintf(stderr, "%s: using Metal backend\n", __func__);
// backend = ggml_backend_metal_init();
// if (!backend) {
// fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
// }
// #endif
// if there aren't GPU Backends fallback to CPU backend
if (!backend) {
backend = ggml_backend_cpu_init();
}
const int num_tensors = 4;
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx = ggml_init(params);
auto n_samples = t_input->ne[0];
auto n_embd = t_input->ne[1];
dev_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
dev_square = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
ggml_set_name(dev_input, "dev_input");
ggml_set_name(dev_square, "dev_square");
ggml_set_name(dev_eigenvector, "dev_eigenvector");
buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));
// initialize eigenvector to random normalized vector
{
std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
std::uniform_real_distribution<float> distribution(0.0, 1.0);
float sum_sqr = 0.0; // for normalizing random_vec
for (size_t i = 0; i < random_vec.size(); ++i) {
float f = distribution(generator);
sum_sqr += f * f;
random_vec[i] = f;
}
// normalize it
float random_vec_norm = std::sqrt(sum_sqr);
for (size_t i = 0; i < random_vec.size(); ++i) {
random_vec[i] /= random_vec_norm;
}
ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
}
}
~pca_model() {
ggml_free(ctx);
ggml_backend_buffer_free(buffer);
ggml_backend_free(backend);
}
};
static struct ggml_cgraph * build_graph_piter(
const struct pca_params & params,
const pca_model & model,
bool calc_square = false) {
GGML_ASSERT(params.n_batch > 0);
// TODO: buf_size must be able to scale with params.n_batch
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
// create a temporally context to build the graph
struct ggml_context * ctx0 = ggml_init(params0);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
// turn v_diff_original into square matrix if needed
struct ggml_tensor * tmp_square;
if (calc_square) {
tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
ggml_set_name(tmp_square, "tmp_square");
}
struct ggml_tensor * b_tensor;
struct ggml_tensor * distance;
struct ggml_tensor * old_eigen = model.dev_eigenvector;
struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;
for (int i = 0; i < params.n_batch; ++i) {
// b_tensor = square * eigenvector^T
b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
ggml_set_name(b_tensor, "b_tensor");
// normalize
b_tensor = ggml_div_inplace(ctx0,
b_tensor,
ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
);
ggml_format_name(b_tensor, "b_tensor_norm_%d", i);
// calculate distance(new eigenvector - old eigenvector)
// we don't use ggml_sub because it may not be implemented on GPU backend
struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
distance = ggml_sqrt_inplace(ctx0,
ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
ggml_format_name(distance, "distance_%d", i);
old_eigen = b_tensor;
// build operations nodes
ggml_build_forward_expand(gf, distance);
}
// delete the temporally context used to build the graph
ggml_free(ctx0);
return gf;
}
static ggml_status compute_piter(
const struct pca_params & params,
const pca_model & model,
struct ggml_cgraph * gf,
ggml_gallocr_t allocr,
struct pca_result & result) {
// allocate tensors
ggml_gallocr_alloc_graph(allocr, gf);
if (ggml_backend_is_cpu(model.backend)) {
ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
}
// TODO: enable GPU support when support for GGML_OP_SQRT is added
//#ifdef GGML_USE_METAL
// if (ggml_backend_is_metal(model.backend)) {
// ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
// }
//#endif
ggml_status res = ggml_backend_graph_compute(model.backend, gf);
if (res == GGML_STATUS_SUCCESS) {
auto extract_i = [](std::string prefix, std::string str) -> int {
int i = -1;
if (str.rfind(prefix, 0) == 0) {
sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
}
return i;
};
result.calculated_square = NULL;
result.eigenvectors.clear();
result.distances.clear();
result.eigenvectors.resize(params.n_batch);
result.distances.resize(params.n_batch);
// get output nodes
for (int i = 0; i < gf->n_nodes; ++i) {
auto node = gf->nodes[i];
int iter = -1;
// find b_tensor (without copying data from device)
if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
result.eigenvectors[iter] = node;
}
// find distances, then copy data from device
if ((iter = extract_i("distance_", node->name)) > -1) {
float d;
ggml_backend_tensor_get(node, &d, 0, sizeof(float));
result.distances[iter] = d;
// std::cout << node->name << " = " << d << "\n";
}
// find tmp_square if it exists (without copying data from device)
if (std::string(node->name) == "tmp_square") {
result.calculated_square = node;
}
}
}
return res;
}
static void power_iteration(
const struct pca_params & params,
struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
struct ggml_tensor * output) {
//printf("in power iteration\n");
struct pca_model model(input);
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
struct pca_result result;
struct ggml_tensor * last_eigenvector = NULL;
int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations
for (int iter = 0; iter < n_iters; ++iter) {
bool calc_square = (iter == 0); // only need to calculate square for first iteration
struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
// ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
compute_piter(params, model, gf, allocr, result);
for (size_t k = 0; k < result.distances.size(); ++k) {
last_eigenvector = result.eigenvectors[k];
if (result.distances[k] < params.tolerance) {
break; // done
}
}
if (calc_square) {
// copy and store the square matrix if needed
GGML_ASSERT(result.calculated_square != NULL);
ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
}
{
// copy last eigen vector and store as input for next iteration
GGML_ASSERT(last_eigenvector != NULL);
ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
}
printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
__func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
}
// get output tensor
GGML_ASSERT(last_eigenvector);
ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
//print_debug_tensor(output);
ggml_gallocr_free(allocr);
}
static void run_pca(
struct pca_params & params,
const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
const std::vector<struct ggml_tensor *> & v_output) {
printf("%s: Running PCA...\n", __func__);
for (size_t il = 0; il < v_input.size(); ++il) {
// prepare output vector
struct ggml_tensor * ctrl_out = v_output[il];
ggml_format_name(ctrl_out, "direction.%ld", il+1);
// run power_iteration
params.i_layer = il;
params.n_layers = v_input.size();
power_iteration(params, v_input[il], ctrl_out);
printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
}
}
}

View File

@ -0,0 +1 @@
[INST] Act like a person who is extremely happy. [/INST]

View File

@ -1,4 +1,4 @@
set(TARGET embedding)
set(TARGET llama-embedding)
add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View File

@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor
### Unix-based systems (Linux, macOS, etc.):
```bash
./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
```
### Windows:
```powershell
embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
```
The above command will output space-separated float values.

View File

@ -17,9 +17,10 @@ static std::vector<std::string> split_lines(const std::string & s) {
return lines;
}
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
for (size_t i = 0; i < tokens.size(); i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
size_t n_tokens = tokens.size();
for (size_t i = 0; i < n_tokens; i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, true);
}
}
@ -40,13 +41,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
// try to get sequence embeddings - supported only when pooling_type is not NONE
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
continue;
}
}
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
float * out = output + batch.seq_id[i][0] * n_embd;
//TODO: I would also add a parameter here to enable normalization or not.
@ -63,6 +58,7 @@ int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}
@ -79,9 +75,6 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = string_random_prompt(rng);
}
llama_backend_init();
llama_numa_init(params.numa);
@ -99,6 +92,12 @@ int main(int argc, char ** argv) {
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
return 1;
}
if (n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);

View File

@ -1,9 +1,9 @@
set(TARGET eval-callback)
set(TARGET llama-eval-callback)
add_executable(${TARGET} eval-callback.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TEST_TARGET test-eval-callback)
add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)

View File

@ -6,7 +6,7 @@ It simply prints to the console all operations and tensor data.
Usage:
```shell
eval-callback \
llama-eval-callback \
--hf-repo ggml-org/models \
--hf-file phi-2/ggml-model-q4_0.gguf \
--model phi-2-q4_0.gguf \

View File

@ -140,20 +140,18 @@ static bool run(llama_context * ctx, const gpt_params & params) {
}
int main(int argc, char ** argv) {
callback_data cb_data;
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}
print_build_info();
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = string_random_prompt(rng);
}
llama_backend_init();
llama_numa_init(params.numa);

View File

@ -1,4 +1,4 @@
set(TARGET export-lora)
set(TARGET llama-export-lora)
add_executable(${TARGET} export-lora.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View File

@ -3,7 +3,7 @@
Apply LORA adapters to base model and export the resulting model.
```
usage: export-lora [options]
usage: llama-export-lora [options]
options:
-h, --help show this help message and exit
@ -17,7 +17,7 @@ options:
For example:
```bash
./bin/export-lora \
./bin/llama-export-lora \
-m open-llama-3b-v2-q8_0.gguf \
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin

View File

@ -1,4 +1,4 @@
set(TARGET finetune)
set(TARGET llama-finetune)
add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View File

@ -7,7 +7,7 @@ Basic usage instructions:
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
# finetune LORA adapter
./bin/finetune \
./bin/llama-finetune \
--model-base open-llama-3b-v2-q8_0.gguf \
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
--use-checkpointing
# predict
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
@ -38,14 +38,14 @@ After 10 more iterations:
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above.
These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
In `main` you can also load multiple LORA adapters, which will then be mixed together.
In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
```bash
./bin/main -m open-llama-3b-v2-q8_0.gguf \
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
```
@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
```bash
./bin/main -m open-llama-3b-v2-q8_0.gguf \
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin

View File

@ -564,7 +564,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
const int rope_mode = 0;
return ggml_rope_ext(ctx,
t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx,
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
};

View File

@ -2,7 +2,7 @@
cd `dirname $0`
cd ../..
EXE="./finetune"
EXE="./llama-finetune"
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi

View File

@ -1,5 +1,5 @@
set(TARGET gbnf-validator)
set(TARGET llama-gbnf-validator)
add_executable(${TARGET} gbnf-validator.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@ -7,6 +7,8 @@
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <fstream>
#include <string>
#include <vector>
@ -69,13 +71,14 @@ int main(int argc, char** argv) {
return 1;
}
fseek(grammar_file, 0, SEEK_END);
size_t grammar_size = ftell(grammar_file);
fseek(grammar_file, 0, SEEK_SET);
std::string grammar_str(grammar_size, ' ');
fread(&grammar_str[0], 1, grammar_size, grammar_file);
fclose(grammar_file);
std::string grammar_str;
{
std::ifstream grammar_file(grammar_filename);
GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
std::stringstream buffer;
buffer << grammar_file.rdbuf();
grammar_str = buffer.str();
}
// Parse the GBNF grammar
auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
@ -100,20 +103,15 @@ int main(int argc, char** argv) {
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
// Read the input file
FILE* input_file = fopen(input_filename.c_str(), "r");
if (!input_file) {
fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str());
return 1;
std::string input_str;
{
std::ifstream input_file(input_filename);
GGML_ASSERT(input_file.is_open() && "Failed to open input file");
std::stringstream buffer;
buffer << input_file.rdbuf();
input_str = buffer.str();
}
fseek(input_file, 0, SEEK_END);
size_t input_size = ftell(input_file);
fseek(input_file, 0, SEEK_SET);
std::string input_str(input_size, ' ');
fread(&input_str[0], 1, input_size, input_file);
fclose(input_file);
// Validate the input string against the grammar
size_t error_pos;
std::string error_msg;

View File

@ -1,4 +1,4 @@
set(TARGET gguf-split)
set(TARGET llama-gguf-split)
add_executable(${TARGET} gguf-split.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View File

@ -61,10 +61,10 @@ static size_t split_str_to_n_bytes(std::string str) {
int n;
if (str.back() == 'M') {
sscanf(str.c_str(), "%d", &n);
n_bytes = (size_t)n * 1024 * 1024; // megabytes
n_bytes = (size_t)n * 1000 * 1000; // megabytes
} else if (str.back() == 'G') {
sscanf(str.c_str(), "%d", &n);
n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
} else {
throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
}
@ -284,7 +284,7 @@ struct split_strategy {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
total_size += ggml_nbytes(t);
}
total_size = total_size / 1024 / 1024; // convert to megabytes
total_size = total_size / 1000 / 1000; // convert to megabytes
printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
i_split++;
}

Some files were not shown because too many files have changed in this diff Show More