mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-02-06 08:30:33 +01:00
Merge branch 'ggerganov:master' into master
This commit is contained in:
commit
cf12ada7f2
81
.devops/cpu.Dockerfile
Normal file
81
.devops/cpu.Dockerfile
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
ARG UBUNTU_VERSION=22.04

# Build stage: compiles the project from the build context and stages the
# artifacts that the later stages copy out.
FROM ubuntu:${UBUNTU_VERSION} AS build

# Toolchain plus the libcurl development package (the build is configured with -DLLAMA_CURL=ON).
RUN apt-get update \
    && apt-get install -y \
        build-essential \
        cmake \
        git \
        libcurl4-openssl-dev

WORKDIR /app

COPY . .

# Configure a Release build, then compile using every available core.
RUN cmake -S . -B build \
        -DCMAKE_BUILD_TYPE=Release \
        -DGGML_BACKEND_DL=ON \
        -DGGML_CPU_ALL_VARIANTS=ON \
        -DGGML_NATIVE=OFF \
        -DLLAMA_CURL=ON \
    && cmake --build build -j $(nproc)

# Gather every shared library produced anywhere under build/ into /app/lib.
RUN mkdir -p /app/lib \
    && find build -name '*.so' -exec cp {} /app/lib \;

# Stage the payload of the "full" image under /app/full: binaries, top-level
# Python scripts, gguf-py, the requirements files and the tools.sh script.
RUN mkdir -p /app/full && \
    cp build/bin/* /app/full && \
    cp *.py /app/full && \
    cp -r gguf-py /app/full && \
    cp -r requirements /app/full && \
    cp requirements.txt /app/full && \
    cp .devops/tools.sh /app/full/tools.sh
|
||||||
|
|
||||||
|
## Base image
# Runtime layer shared by the full, light and server stages.
FROM ubuntu:$UBUNTU_VERSION AS base

# Install the runtime packages (libgomp1, and curl which the server stage's
# HEALTHCHECK invokes), then strip apt caches and temp files to keep the layer small.
# apt-get is used throughout: the `apt` front end does not promise a stable
# command-line interface and warns when used from scripts.
RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

# Shared libraries collected by the build stage.
COPY --from=build /app/lib/ /app
|
||||||
|
|
||||||
|
### Full
# Full image: everything staged in /app/full by the build stage, plus the
# Python dependencies listed in requirements.txt.
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

# Install git and Python, then the Python requirements, and clean up in the same layer.
# --no-cache-dir: pip's download cache lives under the user's home directory,
# which none of the cleanup steps below touch, so it would otherwise stay in the image.
# apt-get (not apt) is used because apt's CLI is not stable for scripted use.
RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --no-cache-dir --upgrade pip setuptools wheel \
    && pip install --no-cache-dir -r requirements.txt \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]
|
||||||
|
|
||||||
|
### Light, CLI only
# The base runtime (which already carries the shared libraries) plus the
# llama-cli binary, which is also the entrypoint.
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT ["/app/llama-cli"]
|
||||||
|
|
||||||
|
### Server, Server only
# The base runtime plus the llama-server binary.
FROM base AS server

# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

# Poll /health on port 8080; curl -f turns an HTTP error status into a failed check.
HEALTHCHECK CMD ["curl", "-f", "http://localhost:8080/health"]

ENTRYPOINT ["/app/llama-server"]
|
94
.devops/cuda.Dockerfile
Normal file
94
.devops/cuda.Dockerfile
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

WORKDIR /app

COPY . .

# Use the default CUDA archs if not specified; otherwise pass the requested
# list to CMake through CMAKE_ARGS.
# -DCMAKE_BUILD_TYPE=Release is set at configure time because `--config Release`
# on the build command only takes effect with multi-config generators; this
# also matches how cpu.Dockerfile configures its build.
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)

# Gather every shared library produced under build/ into /app/lib.
RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

# Stage the payload of the "full" image under /app/full.
RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh
|
||||||
|
|
||||||
|
## Base image
# Runtime layer shared by the full, light and server stages, built on the CUDA runtime image.
FROM ${BASE_CUDA_RUN_CONTAINER} AS base

# Install the runtime packages (libgomp1, and curl which the server stage's
# HEALTHCHECK invokes), then strip apt caches and temp files in the same layer.
# apt-get is used throughout: the `apt` front end does not promise a stable
# command-line interface and warns when used from scripts.
RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

# Shared libraries collected by the build stage.
COPY --from=build /app/lib/ /app
|
||||||
|
|
||||||
|
### Full
# Full image: everything staged in /app/full by the build stage, plus the
# Python dependencies listed in requirements.txt.
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

# Install git and Python, then the Python requirements, and clean up in the same layer.
# --no-cache-dir: pip's download cache lives under the user's home directory,
# which none of the cleanup steps below touch, so it would otherwise stay in the image.
# apt-get (not apt) is used because apt's CLI is not stable for scripted use.
RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --no-cache-dir --upgrade pip setuptools wheel \
    && pip install --no-cache-dir -r requirements.txt \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete


ENTRYPOINT ["/app/tools.sh"]
|
||||||
|
|
||||||
|
### Light, CLI only
# The CUDA base runtime (which already carries the shared libraries) plus the
# llama-cli binary, which is also the entrypoint.
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT ["/app/llama-cli"]
|
||||||
|
|
||||||
|
### Server, Server only
# The CUDA base runtime plus the llama-server binary.
FROM base AS server

# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

# Poll /health on port 8080; curl -f turns an HTTP error status into a failed check.
HEALTHCHECK CMD ["curl", "-f", "http://localhost:8080/health"]

ENTRYPOINT ["/app/llama-server"]
|
@ -1,33 +0,0 @@
|
|||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
# This needs to generally match the container host's environment.
|
|
||||||
ARG CUDA_VERSION=12.6.0
|
|
||||||
# Target the CUDA build image
|
|
||||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
|
||||||
|
|
||||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
|
||||||
|
|
||||||
# CUDA architecture to build for (defaults to all supported archs)
|
|
||||||
ARG CUDA_DOCKER_ARCH=default
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
|
|
||||||
|
|
||||||
COPY requirements.txt requirements.txt
|
|
||||||
COPY requirements requirements
|
|
||||||
|
|
||||||
RUN pip install --upgrade pip setuptools wheel \
|
|
||||||
&& pip install -r requirements.txt
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Use the default CUDA archs if not specified
|
|
||||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
|
||||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
|
||||||
fi && \
|
|
||||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
|
||||||
cmake --build build --config Release -j$(nproc) && \
|
|
||||||
cp build/bin/* .
|
|
||||||
|
|
||||||
ENTRYPOINT ["/app/.devops/tools.sh"]
|
|
@ -1,33 +0,0 @@
|
|||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
# This needs to generally match the container host's environment.
|
|
||||||
ARG MUSA_VERSION=rc3.1.0
|
|
||||||
# Target the MUSA build image
|
|
||||||
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
|
||||||
|
|
||||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
|
||||||
|
|
||||||
# MUSA architecture to build for (defaults to all supported archs)
|
|
||||||
ARG MUSA_DOCKER_ARCH=default
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
|
|
||||||
|
|
||||||
COPY requirements.txt requirements.txt
|
|
||||||
COPY requirements requirements
|
|
||||||
|
|
||||||
RUN pip install --upgrade pip setuptools wheel \
|
|
||||||
&& pip install -r requirements.txt
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Use the default MUSA archs if not specified
|
|
||||||
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
|
|
||||||
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
|
|
||||||
fi && \
|
|
||||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
|
||||||
cmake --build build --config Release -j$(nproc) && \
|
|
||||||
cp build/bin/* .
|
|
||||||
|
|
||||||
ENTRYPOINT ["/app/.devops/tools.sh"]
|
|
@ -1,50 +0,0 @@
|
|||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
|
|
||||||
# This needs to generally match the container host's environment.
|
|
||||||
ARG ROCM_VERSION=5.6
|
|
||||||
|
|
||||||
# Target the ROCm build image
|
|
||||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
|
||||||
|
|
||||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
|
||||||
|
|
||||||
# Unless otherwise specified, we make a fat build.
|
|
||||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
|
||||||
# This is mostly tied to rocBLAS supported archs.
|
|
||||||
ARG ROCM_DOCKER_ARCH="\
|
|
||||||
gfx803 \
|
|
||||||
gfx900 \
|
|
||||||
gfx906 \
|
|
||||||
gfx908 \
|
|
||||||
gfx90a \
|
|
||||||
gfx1010 \
|
|
||||||
gfx1030 \
|
|
||||||
gfx1100 \
|
|
||||||
gfx1101 \
|
|
||||||
gfx1102"
|
|
||||||
|
|
||||||
COPY requirements.txt requirements.txt
|
|
||||||
COPY requirements requirements
|
|
||||||
|
|
||||||
RUN pip install --upgrade pip setuptools wheel \
|
|
||||||
&& pip install -r requirements.txt
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Set the AMD GPU target architectures
|
|
||||||
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
|
|
||||||
# Enable ROCm
|
|
||||||
ENV GGML_HIPBLAS=1
|
|
||||||
ENV CC=/opt/rocm/llvm/bin/clang
|
|
||||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
|
||||||
|
|
||||||
# Enable cURL
|
|
||||||
ENV LLAMA_CURL=1
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y libcurl4-openssl-dev
|
|
||||||
|
|
||||||
RUN make -j$(nproc)
|
|
||||||
|
|
||||||
ENTRYPOINT ["/app/.devops/tools.sh"]
|
|
@ -1,38 +0,0 @@
|
|||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
|
|
||||||
cmake --build build -j $(nproc) && \
|
|
||||||
mkdir -p /app/lib && \
|
|
||||||
find build -name "*.so" -exec cp {} /app/lib/ \;
|
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
|
|
||||||
|
|
||||||
COPY requirements.txt /app/requirements.txt
|
|
||||||
COPY requirements /app/requirements
|
|
||||||
COPY .devops/tools.sh /app/tools.sh
|
|
||||||
|
|
||||||
RUN pip install --upgrade pip setuptools wheel && \
|
|
||||||
pip install -r /app/requirements.txt
|
|
||||||
|
|
||||||
COPY --from=build /app/build/bin/ /app/
|
|
||||||
COPY --from=build /app/lib/ /app/
|
|
||||||
COPY --from=build /app/convert_hf_to_gguf.py /app/
|
|
||||||
COPY --from=build /app/gguf-py /app/gguf-py
|
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
|
||||||
|
|
||||||
ENTRYPOINT ["/app/tools.sh"]
|
|
91
.devops/intel.Dockerfile
Normal file
91
.devops/intel.Dockerfile
Normal file
@ -0,0 +1,91 @@
|
|||||||
|
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04

## Build Image

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

# Set to ON to pass -DGGML_SYCL_F16=ON to the CMake configure step.
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git libcurl4-openssl-dev

WORKDIR /app

COPY . .

# Configure with the icx/icpx compilers and SYCL enabled, then build.
# -DCMAKE_BUILD_TYPE=Release is set at configure time because `--config Release`
# on the build command only takes effect with multi-config generators; this
# also matches how cpu.Dockerfile configures its build.
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" \
        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with dynamic libs" && \
    cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release -j$(nproc)

# Gather every shared library produced under build/ into /app/lib.
RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

# Stage the payload of the "full" image under /app/full.
RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh
|
||||||
|
|
||||||
|
# Runtime layer shared by the full, light and server stages. Unlike the build
# artifacts, the shared libraries are copied in by each final stage, not here.
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base

# Install the runtime packages (libgomp1, and curl which the server stage's
# HEALTHCHECK invokes), then strip apt caches and temp files in the same layer.
# apt-get is used throughout: the `apt` front end does not promise a stable
# command-line interface and warns when used from scripts.
RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
|
||||||
|
|
||||||
|
### Full
# Full image: shared libraries and everything staged in /app/full by the build
# stage, plus the Python dependencies listed in requirements.txt.
FROM base AS full

COPY --from=build /app/lib/ /app
COPY --from=build /app/full /app

WORKDIR /app

# Install git and Python, then the Python requirements, and clean up in the same layer.
# --no-cache-dir: pip's download cache lives under the user's home directory,
# which none of the cleanup steps below touch, so it would otherwise stay in the image.
# apt-get (not apt) is used because apt's CLI is not stable for scripted use.
RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --no-cache-dir --upgrade pip setuptools wheel \
    && pip install --no-cache-dir -r requirements.txt \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete


ENTRYPOINT ["/app/tools.sh"]
|
||||||
|
|
||||||
|
### Light, CLI only
# The base runtime plus the shared libraries and the llama-cli binary,
# which is also the entrypoint.
FROM base AS light

# Libraries first: this copy is what creates /app as a directory.
COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT ["/app/llama-cli"]
|
||||||
|
|
||||||
|
### Server, Server only
# The base runtime plus the shared libraries and the llama-server binary.
FROM base AS server

# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0

# Libraries first: this copy is what creates /app as a directory.
COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-server /app

WORKDIR /app

# Poll /health on port 8080; curl -f turns an HTTP error status into a failed check.
HEALTHCHECK CMD ["curl", "-f", "http://localhost:8080/health"]

ENTRYPOINT ["/app/llama-server"]
|
||||||
|
|
@ -1,38 +0,0 @@
|
|||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
# This needs to generally match the container host's environment.
|
|
||||||
ARG CUDA_VERSION=12.6.0
|
|
||||||
# Target the CUDA build image
|
|
||||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
|
||||||
# Target the CUDA runtime image
|
|
||||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
|
||||||
|
|
||||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
|
||||||
|
|
||||||
# CUDA architecture to build for (defaults to all supported archs)
|
|
||||||
ARG CUDA_DOCKER_ARCH=default
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y build-essential git cmake
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Use the default CUDA archs if not specified
|
|
||||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
|
||||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
|
||||||
fi && \
|
|
||||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
|
||||||
cmake --build build --config Release --target llama-cli -j$(nproc) && \
|
|
||||||
mkdir -p /app/lib && \
|
|
||||||
find build -name "*.so" -exec cp {} /app/lib \;
|
|
||||||
|
|
||||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y libgomp1
|
|
||||||
|
|
||||||
COPY --from=build /app/lib/ /
|
|
||||||
COPY --from=build /app/build/bin/llama-cli /
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/llama-cli" ]
|
|
@ -1,28 +0,0 @@
|
|||||||
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
|
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
|
|
||||||
|
|
||||||
ARG GGML_SYCL_F16=OFF
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y git
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
|
||||||
echo "GGML_SYCL_F16 is set" && \
|
|
||||||
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
|
|
||||||
fi && \
|
|
||||||
echo "Building with static libs" && \
|
|
||||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
|
|
||||||
${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
|
|
||||||
cmake --build build --config Release --target llama-cli
|
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
|
|
||||||
|
|
||||||
COPY --from=build /app/build/bin/llama-cli /llama-cli
|
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/llama-cli" ]
|
|
@ -1,38 +0,0 @@
|
|||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
# This needs to generally match the container host's environment.
|
|
||||||
ARG MUSA_VERSION=rc3.1.0
|
|
||||||
# Target the MUSA build image
|
|
||||||
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
|
||||||
# Target the MUSA runtime image
|
|
||||||
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
|
||||||
|
|
||||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
|
||||||
|
|
||||||
# MUSA architecture to build for (defaults to all supported archs)
|
|
||||||
ARG MUSA_DOCKER_ARCH=default
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y build-essential git cmake
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Use the default MUSA archs if not specified
|
|
||||||
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
|
|
||||||
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
|
|
||||||
fi && \
|
|
||||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
|
||||||
cmake --build build --config Release --target llama-cli -j$(nproc) && \
|
|
||||||
mkdir -p /app/lib && \
|
|
||||||
find build -name "*.so" -exec cp {} /app/lib \;
|
|
||||||
|
|
||||||
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y libgomp1
|
|
||||||
|
|
||||||
COPY --from=build /app/lib/ /
|
|
||||||
COPY --from=build /app/build/bin/llama-cli /llama-cli
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/llama-cli" ]
|
|
@ -1,45 +0,0 @@
|
|||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
|
|
||||||
# This needs to generally match the container host's environment.
|
|
||||||
ARG ROCM_VERSION=5.6
|
|
||||||
|
|
||||||
# Target the ROCm build image
|
|
||||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
|
||||||
|
|
||||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
|
||||||
|
|
||||||
# Unless otherwise specified, we make a fat build.
|
|
||||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
|
||||||
# This is mostly tied to rocBLAS supported archs.
|
|
||||||
ARG ROCM_DOCKER_ARCH="\
|
|
||||||
gfx803 \
|
|
||||||
gfx900 \
|
|
||||||
gfx906 \
|
|
||||||
gfx908 \
|
|
||||||
gfx90a \
|
|
||||||
gfx1010 \
|
|
||||||
gfx1030 \
|
|
||||||
gfx1100 \
|
|
||||||
gfx1101 \
|
|
||||||
gfx1102"
|
|
||||||
|
|
||||||
COPY requirements.txt requirements.txt
|
|
||||||
COPY requirements requirements
|
|
||||||
|
|
||||||
RUN pip install --upgrade pip setuptools wheel \
|
|
||||||
&& pip install -r requirements.txt
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Set the AMD GPU target architectures
|
|
||||||
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
|
|
||||||
# Enable ROCm
|
|
||||||
ENV GGML_HIPBLAS=1
|
|
||||||
ENV CC=/opt/rocm/llvm/bin/clang
|
|
||||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
|
||||||
|
|
||||||
RUN make -j$(nproc) llama-cli
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/app/llama-cli" ]
|
|
@ -1,27 +0,0 @@
|
|||||||
ARG UBUNTU_VERSION=jammy
|
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
|
||||||
|
|
||||||
# Install build tools
|
|
||||||
RUN apt update && apt install -y git build-essential cmake wget libgomp1
|
|
||||||
|
|
||||||
# Install Vulkan SDK
|
|
||||||
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
|
|
||||||
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
|
|
||||||
apt update -y && \
|
|
||||||
apt-get install -y vulkan-sdk
|
|
||||||
|
|
||||||
# Build it
|
|
||||||
WORKDIR /app
|
|
||||||
COPY . .
|
|
||||||
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 && \
|
|
||||||
cmake --build build --config Release --target llama-cli
|
|
||||||
|
|
||||||
# Clean up
|
|
||||||
WORKDIR /
|
|
||||||
RUN cp /app/build/bin/llama-cli /llama-cli && \
|
|
||||||
rm -rf /app
|
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/llama-cli" ]
|
|
@ -1,29 +0,0 @@
|
|||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
|
|
||||||
cmake --build build -j $(nproc) && \
|
|
||||||
mkdir -p /app/lib && \
|
|
||||||
find build -name "*.so" -exec cp {} /app/lib/ \;
|
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION AS runtime
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
|
||||||
|
|
||||||
COPY --from=build /app/build/bin/llama-cli /app/
|
|
||||||
COPY --from=build /app/lib/ /app/
|
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/app/llama-cli" ]
|
|
@ -1,43 +0,0 @@
|
|||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
# This needs to generally match the container host's environment.
|
|
||||||
ARG CUDA_VERSION=12.6.0
|
|
||||||
# Target the CUDA build image
|
|
||||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
|
||||||
# Target the CUDA runtime image
|
|
||||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
|
||||||
|
|
||||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
|
||||||
|
|
||||||
# CUDA architecture to build for (defaults to all supported archs)
|
|
||||||
ARG CUDA_DOCKER_ARCH=default
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Use the default CUDA archs if not specified
|
|
||||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
|
||||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
|
||||||
fi && \
|
|
||||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
|
||||||
cmake --build build --config Release --target llama-server -j$(nproc) && \
|
|
||||||
mkdir -p /app/lib && \
|
|
||||||
find build -name "*.so" -exec cp {} /app/lib \;
|
|
||||||
|
|
||||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
|
||||||
|
|
||||||
COPY --from=build /app/lib/ /
|
|
||||||
COPY --from=build /app/build/bin/llama-server /llama-server
|
|
||||||
|
|
||||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
|
||||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
|
||||||
|
|
||||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/llama-server" ]
|
|
@ -1,34 +0,0 @@
|
|||||||
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
|
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
|
|
||||||
|
|
||||||
ARG GGML_SYCL_F16=OFF
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y git libcurl4-openssl-dev
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
|
||||||
echo "GGML_SYCL_F16 is set" && \
|
|
||||||
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
|
|
||||||
fi && \
|
|
||||||
echo "Building with dynamic libs" && \
|
|
||||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
|
|
||||||
cmake --build build --config Release --target llama-server
|
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y libcurl4-openssl-dev curl
|
|
||||||
|
|
||||||
COPY --from=build /app/build/bin/llama-server /llama-server
|
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
|
||||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
|
||||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
|
||||||
|
|
||||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/llama-server" ]
|
|
@ -1,43 +0,0 @@
|
|||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
# This needs to generally match the container host's environment.
|
|
||||||
ARG MUSA_VERSION=rc3.1.0
|
|
||||||
# Target the MUSA build image
|
|
||||||
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
|
||||||
# Target the MUSA runtime image
|
|
||||||
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
|
||||||
|
|
||||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
|
||||||
|
|
||||||
# MUSA architecture to build for (defaults to all supported archs)
|
|
||||||
ARG MUSA_DOCKER_ARCH=default
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Use the default MUSA archs if not specified
|
|
||||||
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
|
|
||||||
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
|
|
||||||
fi && \
|
|
||||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
|
||||||
cmake --build build --config Release --target llama-server -j$(nproc) && \
|
|
||||||
mkdir -p /app/lib && \
|
|
||||||
find build -name "*.so" -exec cp {} /app/lib \;
|
|
||||||
|
|
||||||
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
|
||||||
|
|
||||||
COPY --from=build /app/lib/ /
|
|
||||||
COPY --from=build /app/build/bin/llama-server /llama-server
|
|
||||||
|
|
||||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
|
||||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
|
||||||
|
|
||||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/llama-server" ]
|
|
@ -1,54 +0,0 @@
|
|||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
|
|
||||||
# This needs to generally match the container host's environment.
|
|
||||||
ARG ROCM_VERSION=5.6
|
|
||||||
|
|
||||||
# Target the ROCm build image
|
|
||||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
|
||||||
|
|
||||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
|
||||||
|
|
||||||
# Unless otherwise specified, we make a fat build.
|
|
||||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
|
||||||
# This is mostly tied to rocBLAS supported archs.
|
|
||||||
ARG ROCM_DOCKER_ARCH="\
|
|
||||||
gfx803 \
|
|
||||||
gfx900 \
|
|
||||||
gfx906 \
|
|
||||||
gfx908 \
|
|
||||||
gfx90a \
|
|
||||||
gfx1010 \
|
|
||||||
gfx1030 \
|
|
||||||
gfx1100 \
|
|
||||||
gfx1101 \
|
|
||||||
gfx1102"
|
|
||||||
|
|
||||||
COPY requirements.txt requirements.txt
|
|
||||||
COPY requirements requirements
|
|
||||||
|
|
||||||
RUN pip install --upgrade pip setuptools wheel \
|
|
||||||
&& pip install -r requirements.txt
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Set the AMD GPU target architectures
|
|
||||||
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
|
|
||||||
# Enable ROCm
|
|
||||||
ENV GGML_HIPBLAS=1
|
|
||||||
ENV CC=/opt/rocm/llvm/bin/clang
|
|
||||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
|
||||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
|
||||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
|
||||||
|
|
||||||
# Enable cURL
|
|
||||||
ENV LLAMA_CURL=1
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y libcurl4-openssl-dev curl
|
|
||||||
|
|
||||||
RUN make -j$(nproc) llama-server
|
|
||||||
|
|
||||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/app/llama-server" ]
|
|
@ -1,31 +0,0 @@
|
|||||||
ARG UBUNTU_VERSION=jammy
|
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
|
||||||
|
|
||||||
# Install build tools
|
|
||||||
RUN apt update && apt install -y git build-essential cmake wget
|
|
||||||
|
|
||||||
# Install Vulkan SDK and cURL
|
|
||||||
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
|
|
||||||
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
|
|
||||||
apt update -y && \
|
|
||||||
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
|
|
||||||
|
|
||||||
# Build it
|
|
||||||
WORKDIR /app
|
|
||||||
COPY . .
|
|
||||||
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
|
|
||||||
cmake --build build --config Release --target llama-server
|
|
||||||
|
|
||||||
# Clean up
|
|
||||||
WORKDIR /
|
|
||||||
RUN cp /app/build/bin/llama-server /llama-server && \
|
|
||||||
rm -rf /app
|
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
|
||||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
|
||||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
|
||||||
|
|
||||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/llama-server" ]
|
|
@ -1,33 +0,0 @@
|
|||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
|
|
||||||
cmake --build build -j $(nproc) && \
|
|
||||||
mkdir -p /app/lib && \
|
|
||||||
find build -name "*.so" -exec cp {} /app/lib/ \;
|
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION AS runtime
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
|
||||||
|
|
||||||
COPY --from=build /app/build/bin/llama-server /app/
|
|
||||||
COPY --from=build /app/lib/ /app/
|
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
|
||||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
|
||||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
|
||||||
|
|
||||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/app/llama-server" ]
|
|
108
.devops/musa.Dockerfile
Normal file
108
.devops/musa.Dockerfile
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
ARG UBUNTU_VERSION=22.04
|
||||||
|
# This needs to generally match the container host's environment.
|
||||||
|
ARG MUSA_VERSION=rc3.1.0
|
||||||
|
# Target the MUSA build image
|
||||||
|
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||||
|
|
||||||
|
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||||
|
|
||||||
|
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
||||||
|
|
||||||
|
# MUSA architecture to build for (defaults to all supported archs)
|
||||||
|
ARG MUSA_DOCKER_ARCH=default
|
||||||
|
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y \
|
||||||
|
build-essential \
|
||||||
|
cmake \
|
||||||
|
python3 \
|
||||||
|
python3-pip \
|
||||||
|
git \
|
||||||
|
libcurl4-openssl-dev \
|
||||||
|
libgomp1
|
||||||
|
|
||||||
|
COPY requirements.txt requirements.txt
|
||||||
|
COPY requirements requirements
|
||||||
|
|
||||||
|
RUN pip install --upgrade pip setuptools wheel \
|
||||||
|
&& pip install -r requirements.txt
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# Use the default MUSA archs if not specified
|
||||||
|
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
|
||||||
|
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
|
||||||
|
fi && \
|
||||||
|
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||||
|
cmake --build build --config Release -j$(nproc)
|
||||||
|
|
||||||
|
RUN mkdir -p /app/lib && \
|
||||||
|
find build -name "*.so" -exec cp {} /app/lib \;
|
||||||
|
|
||||||
|
RUN mkdir -p /app/full \
|
||||||
|
&& cp build/bin/* /app/full \
|
||||||
|
&& cp *.py /app/full \
|
||||||
|
&& cp -r gguf-py /app/full \
|
||||||
|
&& cp -r requirements /app/full \
|
||||||
|
&& cp requirements.txt /app/full \
|
||||||
|
&& cp .devops/tools.sh /app/full/tools.sh
|
||||||
|
|
||||||
|
## Base image
|
||||||
|
FROM ${BASE_MUSA_RUN_CONTAINER} AS base
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y libgomp1 curl\
|
||||||
|
&& apt autoremove -y \
|
||||||
|
&& apt clean -y \
|
||||||
|
&& rm -rf /tmp/* /var/tmp/* \
|
||||||
|
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||||
|
&& find /var/cache -type f -delete
|
||||||
|
|
||||||
|
COPY --from=build /app/lib/ /app
|
||||||
|
|
||||||
|
### Full
|
||||||
|
FROM base AS full
|
||||||
|
|
||||||
|
COPY --from=build /app/full /app
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y \
|
||||||
|
git \
|
||||||
|
python3 \
|
||||||
|
python3-pip \
|
||||||
|
&& pip install --upgrade pip setuptools wheel \
|
||||||
|
&& pip install -r requirements.txt \
|
||||||
|
&& apt autoremove -y \
|
||||||
|
&& apt clean -y \
|
||||||
|
&& rm -rf /tmp/* /var/tmp/* \
|
||||||
|
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||||
|
&& find /var/cache -type f -delete
|
||||||
|
|
||||||
|
|
||||||
|
ENTRYPOINT ["/app/tools.sh"]
|
||||||
|
|
||||||
|
### Light, CLI only
|
||||||
|
FROM base AS light
|
||||||
|
|
||||||
|
COPY --from=build /app/full/llama-cli /app
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/app/llama-cli" ]
|
||||||
|
|
||||||
|
### Server, Server only
|
||||||
|
FROM base AS server
|
||||||
|
|
||||||
|
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||||
|
|
||||||
|
COPY --from=build /app/full/llama-server /app
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/app/llama-server" ]
|
113
.devops/rocm.Dockerfile
Normal file
113
.devops/rocm.Dockerfile
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
ARG UBUNTU_VERSION=24.04
|
||||||
|
|
||||||
|
# This needs to generally match the container host's environment.
|
||||||
|
ARG ROCM_VERSION=6.3
|
||||||
|
ARG AMDGPU_VERSION=6.3
|
||||||
|
|
||||||
|
# Target the ROCm build image
|
||||||
|
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||||
|
|
||||||
|
### Build image
|
||||||
|
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||||
|
|
||||||
|
# Unless otherwise specified, we make a fat build.
|
||||||
|
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||||
|
# This is mostly tied to rocBLAS supported archs.
|
||||||
|
# gfx803, gfx900, gfx1032, gfx1101, gfx1102 are not officially supported
|
||||||
|
# gfx906 is deprecated
|
||||||
|
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
|
||||||
|
|
||||||
|
#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
|
||||||
|
ARG ROCM_DOCKER_ARCH=gfx1100
|
||||||
|
|
||||||
|
# Set the AMD GPU target architectures
|
||||||
|
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||||
|
# Enable ROCm
|
||||||
|
# ENV CC=/opt/rocm/llvm/bin/clang
|
||||||
|
# ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y \
|
||||||
|
build-essential \
|
||||||
|
cmake \
|
||||||
|
git \
|
||||||
|
libcurl4-openssl-dev \
|
||||||
|
curl \
|
||||||
|
libgomp1
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
||||||
|
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
|
||||||
|
&& cmake --build build --config Release -j$(nproc)
|
||||||
|
|
||||||
|
RUN mkdir -p /app/lib \
|
||||||
|
&& find build -name "*.so" -exec cp {} /app/lib \;
|
||||||
|
|
||||||
|
RUN mkdir -p /app/full \
|
||||||
|
&& cp build/bin/* /app/full \
|
||||||
|
&& cp *.py /app/full \
|
||||||
|
&& cp -r gguf-py /app/full \
|
||||||
|
&& cp -r requirements /app/full \
|
||||||
|
&& cp requirements.txt /app/full \
|
||||||
|
&& cp .devops/tools.sh /app/full/tools.sh
|
||||||
|
|
||||||
|
## Base image
|
||||||
|
FROM ${BASE_ROCM_DEV_CONTAINER} AS base
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y libgomp1 curl\
|
||||||
|
&& apt autoremove -y \
|
||||||
|
&& apt clean -y \
|
||||||
|
&& rm -rf /tmp/* /var/tmp/* \
|
||||||
|
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||||
|
&& find /var/cache -type f -delete
|
||||||
|
|
||||||
|
COPY --from=build /app/lib/ /app
|
||||||
|
|
||||||
|
### Full
|
||||||
|
FROM base AS full
|
||||||
|
|
||||||
|
COPY --from=build /app/full /app
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y \
|
||||||
|
git \
|
||||||
|
python3-pip \
|
||||||
|
python3 \
|
||||||
|
python3-wheel\
|
||||||
|
&& pip install --break-system-packages --upgrade setuptools \
|
||||||
|
&& pip install --break-system-packages -r requirements.txt \
|
||||||
|
&& apt autoremove -y \
|
||||||
|
&& apt clean -y \
|
||||||
|
&& rm -rf /tmp/* /var/tmp/* \
|
||||||
|
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||||
|
&& find /var/cache -type f -delete
|
||||||
|
|
||||||
|
ENTRYPOINT ["/app/tools.sh"]
|
||||||
|
|
||||||
|
### Light, CLI only
|
||||||
|
FROM base AS light
|
||||||
|
|
||||||
|
COPY --from=build /app/full/llama-cli /app
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/app/llama-cli" ]
|
||||||
|
|
||||||
|
### Server, Server only
|
||||||
|
FROM base AS server
|
||||||
|
|
||||||
|
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||||
|
|
||||||
|
COPY --from=build /app/full/llama-server /app
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/app/llama-server" ]
|
88
.devops/vulkan.Dockerfile
Normal file
88
.devops/vulkan.Dockerfile
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
ARG UBUNTU_VERSION=jammy
|
||||||
|
|
||||||
|
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||||
|
|
||||||
|
# Install build tools
|
||||||
|
RUN apt update && apt install -y git build-essential cmake wget
|
||||||
|
|
||||||
|
# Install Vulkan SDK and cURL
|
||||||
|
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
|
||||||
|
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
|
||||||
|
apt update -y && \
|
||||||
|
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
|
||||||
|
|
||||||
|
# Build it
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
|
||||||
|
cmake --build build --config Release -j$(nproc)
|
||||||
|
|
||||||
|
RUN mkdir -p /app/lib && \
|
||||||
|
find build -name "*.so" -exec cp {} /app/lib \;
|
||||||
|
|
||||||
|
RUN mkdir -p /app/full \
|
||||||
|
&& cp build/bin/* /app/full \
|
||||||
|
&& cp *.py /app/full \
|
||||||
|
&& cp -r gguf-py /app/full \
|
||||||
|
&& cp -r requirements /app/full \
|
||||||
|
&& cp requirements.txt /app/full \
|
||||||
|
&& cp .devops/tools.sh /app/full/tools.sh
|
||||||
|
|
||||||
|
## Base image
|
||||||
|
FROM ubuntu:$UBUNTU_VERSION AS base
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y libgomp1 curl\
|
||||||
|
&& apt autoremove -y \
|
||||||
|
&& apt clean -y \
|
||||||
|
&& rm -rf /tmp/* /var/tmp/* \
|
||||||
|
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||||
|
&& find /var/cache -type f -delete
|
||||||
|
|
||||||
|
COPY --from=build /app/lib/ /app
|
||||||
|
|
||||||
|
### Full
|
||||||
|
FROM base AS full
|
||||||
|
|
||||||
|
COPY --from=build /app/full /app
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y \
|
||||||
|
git \
|
||||||
|
python3 \
|
||||||
|
python3-pip \
|
||||||
|
&& pip install --upgrade pip setuptools wheel \
|
||||||
|
&& pip install -r requirements.txt \
|
||||||
|
&& apt autoremove -y \
|
||||||
|
&& apt clean -y \
|
||||||
|
&& rm -rf /tmp/* /var/tmp/* \
|
||||||
|
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||||
|
&& find /var/cache -type f -delete
|
||||||
|
|
||||||
|
ENTRYPOINT ["/app/tools.sh"]
|
||||||
|
|
||||||
|
### Light, CLI only
|
||||||
|
FROM base AS light
|
||||||
|
|
||||||
|
COPY --from=build /app/full/llama-cli /app
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/app/llama-cli" ]
|
||||||
|
|
||||||
|
### Server, Server only
|
||||||
|
FROM base AS server
|
||||||
|
|
||||||
|
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||||
|
|
||||||
|
COPY --from=build /app/full/llama-server /app
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/app/llama-server" ]
|
104
.github/workflows/docker.yml
vendored
104
.github/workflows/docker.yml
vendored
@ -34,21 +34,14 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
config:
|
config:
|
||||||
- { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
# Multi-stage build
|
||||||
- { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
|
||||||
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
|
||||||
- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
|
||||||
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
|
||||||
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
|
||||||
- { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
|
|
||||||
- { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
|
|
||||||
- { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
|
|
||||||
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
|
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
|
||||||
#- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
|
||||||
#- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
|
||||||
#- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
|
||||||
- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
|
|
||||||
- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
|
|
||||||
steps:
|
steps:
|
||||||
- name: Check out the repo
|
- name: Check out the repo
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@ -56,10 +49,10 @@ jobs:
|
|||||||
fetch-depth: 0 # preserve git history, so we can determine the build number
|
fetch-depth: 0 # preserve git history, so we can determine the build number
|
||||||
|
|
||||||
- name: Set up QEMU
|
- name: Set up QEMU
|
||||||
uses: docker/setup-qemu-action@v2
|
uses: docker/setup-qemu-action@v3
|
||||||
|
|
||||||
- name: Set up Docker Buildx
|
- name: Set up Docker Buildx
|
||||||
uses: docker/setup-buildx-action@v2
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
- name: Log in to Docker Hub
|
- name: Log in to Docker Hub
|
||||||
uses: docker/login-action@v2
|
uses: docker/login-action@v2
|
||||||
@ -79,25 +72,34 @@ jobs:
|
|||||||
|
|
||||||
# determine tag name postfix (build number, commit hash)
|
# determine tag name postfix (build number, commit hash)
|
||||||
if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
|
if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
|
||||||
TAG_POSTFIX="b${BUILD_NUMBER}"
|
TAG_POSTFIX="-b${BUILD_NUMBER}"
|
||||||
else
|
else
|
||||||
SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
|
SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
|
||||||
TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}"
|
TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# list all tags possible
|
# list all tags possible
|
||||||
TAGS=""
|
if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
|
||||||
TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }},"
|
TYPE=""
|
||||||
TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}"
|
else
|
||||||
|
TYPE="-${{ matrix.config.tag }}"
|
||||||
echo "output_tags=$TAGS" >> $GITHUB_OUTPUT
|
fi
|
||||||
echo "output_tags=$TAGS" # print out for debugging
|
PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
|
||||||
|
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
|
||||||
|
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
|
||||||
|
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
|
||||||
|
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
|
||||||
|
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
|
||||||
|
echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
|
||||||
|
echo "full_output_tags=$FULLTAGS" # print out for debugging
|
||||||
|
echo "light_output_tags=$LIGHTTAGS" # print out for debugging
|
||||||
|
echo "server_output_tags=$SERVERTAGS" # print out for debugging
|
||||||
env:
|
env:
|
||||||
GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||||
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
||||||
|
|
||||||
# https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
|
# https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
|
||||||
- name: Free Disk Space (Ubuntu)
|
- name: Free Disk Space (Ubuntu)
|
||||||
|
if: ${{ matrix.config.freediskspace == true }}
|
||||||
uses: jlumbroso/free-disk-space@main
|
uses: jlumbroso/free-disk-space@main
|
||||||
with:
|
with:
|
||||||
# this might remove tools that are actually needed,
|
# this might remove tools that are actually needed,
|
||||||
@ -113,13 +115,59 @@ jobs:
|
|||||||
docker-images: true
|
docker-images: true
|
||||||
swap-storage: true
|
swap-storage: true
|
||||||
|
|
||||||
- name: Build and push Docker image (tagged + versioned)
|
- name: Build and push Full Docker image (tagged + versioned)
|
||||||
if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
|
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
|
||||||
uses: docker/build-push-action@v6
|
uses: docker/build-push-action@v6
|
||||||
with:
|
with:
|
||||||
context: .
|
context: .
|
||||||
push: true
|
push: true
|
||||||
platforms: ${{ matrix.config.platforms }}
|
platforms: ${{ matrix.config.platforms }}
|
||||||
# tag list is generated from step above
|
# tag list is generated from step above
|
||||||
tags: ${{ steps.tag.outputs.output_tags }}
|
tags: ${{ steps.tag.outputs.full_output_tags }}
|
||||||
file: ${{ matrix.config.dockerfile }}
|
file: ${{ matrix.config.dockerfile }}
|
||||||
|
target: full
|
||||||
|
provenance: false
|
||||||
|
# using github experimental cache
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
|
# return to this if the experimental github cache is having issues
|
||||||
|
#cache-to: type=local,dest=/tmp/.buildx-cache
|
||||||
|
#cache-from: type=local,src=/tmp/.buildx-cache
|
||||||
|
|
||||||
|
- name: Build and push Light Docker image (tagged + versioned)
|
||||||
|
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
|
||||||
|
uses: docker/build-push-action@v6
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
push: true
|
||||||
|
platforms: ${{ matrix.config.platforms }}
|
||||||
|
# tag list is generated from step above
|
||||||
|
tags: ${{ steps.tag.outputs.light_output_tags }}
|
||||||
|
file: ${{ matrix.config.dockerfile }}
|
||||||
|
target: light
|
||||||
|
provenance: false
|
||||||
|
# using github experimental cache
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
|
# return to this if the experimental github cache is having issues
|
||||||
|
#cache-to: type=local,dest=/tmp/.buildx-cache
|
||||||
|
#cache-from: type=local,src=/tmp/.buildx-cache
|
||||||
|
|
||||||
|
- name: Build and push Server Docker image (tagged + versioned)
|
||||||
|
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
|
||||||
|
uses: docker/build-push-action@v6
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
push: true
|
||||||
|
platforms: ${{ matrix.config.platforms }}
|
||||||
|
# tag list is generated from step above
|
||||||
|
tags: ${{ steps.tag.outputs.server_output_tags }}
|
||||||
|
file: ${{ matrix.config.dockerfile }}
|
||||||
|
target: server
|
||||||
|
provenance: false
|
||||||
|
# using github experimental cache
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
|
# return to this if the experimental github cache is having issues
|
||||||
|
#cache-to: type=local,dest=/tmp/.buildx-cache
|
||||||
|
#cache-from: type=local,src=/tmp/.buildx-cache
|
||||||
|
@ -529,9 +529,19 @@ class Model:
|
|||||||
else:
|
else:
|
||||||
token: str = reverse_vocab[i]
|
token: str = reverse_vocab[i]
|
||||||
if token in added_vocab:
|
if token in added_vocab:
|
||||||
|
# The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
|
||||||
|
# To avoid unexpected issues - we make sure to normalize non-normalized tokens
|
||||||
|
if not tokenizer.added_tokens_decoder[i].normalized:
|
||||||
|
previous_token = token
|
||||||
|
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
|
||||||
|
if previous_token != token:
|
||||||
|
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
|
||||||
|
|
||||||
if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
|
if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
|
||||||
toktypes.append(gguf.TokenType.CONTROL)
|
toktypes.append(gguf.TokenType.CONTROL)
|
||||||
else:
|
else:
|
||||||
|
# NOTE: this was added for Gemma.
|
||||||
|
# Encoding and decoding the tokens above isn't sufficient for this case.
|
||||||
token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
|
token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
|
||||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||||
else:
|
else:
|
||||||
@ -575,6 +585,9 @@ class Model:
|
|||||||
if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
|
if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
|
||||||
# ref: https://huggingface.co/tiiuae/falcon-7b
|
# ref: https://huggingface.co/tiiuae/falcon-7b
|
||||||
res = "falcon"
|
res = "falcon"
|
||||||
|
if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
|
||||||
|
# ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
|
||||||
|
res = "falcon3"
|
||||||
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
|
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
|
||||||
# ref: https://huggingface.co/BAAI/bge-small-en-v1.5
|
# ref: https://huggingface.co/BAAI/bge-small-en-v1.5
|
||||||
res = "bert-bge"
|
res = "bert-bge"
|
||||||
@ -671,6 +684,9 @@ class Model:
|
|||||||
if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
|
if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
|
||||||
# ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
|
# ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
|
||||||
res = "gigachat"
|
res = "gigachat"
|
||||||
|
if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
|
||||||
|
# ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
|
||||||
|
res = "megrez"
|
||||||
|
|
||||||
if res is None:
|
if res is None:
|
||||||
logger.warning("\n")
|
logger.warning("\n")
|
||||||
@ -1679,6 +1695,184 @@ class LlamaModel(Model):
|
|||||||
raise ValueError(f"Unprocessed experts: {experts}")
|
raise ValueError(f"Unprocessed experts: {experts}")
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("DeciLMForCausalLM")
|
||||||
|
class DeciModel(Model):
|
||||||
|
model_arch = gguf.MODEL_ARCH.DECI
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
|
||||||
|
# DeciLM-specific code
|
||||||
|
intermediate_size = int(2 * ffn_mult * n_embd / 3)
|
||||||
|
return DeciModel._find_multiple(intermediate_size, 256)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _find_multiple(n: int, k: int) -> int:
|
||||||
|
# DeciLM-specific code
|
||||||
|
if n % k == 0:
|
||||||
|
return n
|
||||||
|
return n + k - (n % k)
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
|
||||||
|
_block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
|
||||||
|
assert self.block_count == len(_block_configs)
|
||||||
|
self._num_kv_heads = list()
|
||||||
|
self._num_heads = list()
|
||||||
|
_ffn_multipliers = list()
|
||||||
|
# ***linear attention layer***
|
||||||
|
# if n_heads_in_group is None and replace_with_linear is True
|
||||||
|
# then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
|
||||||
|
# ***attention-free layer***
|
||||||
|
# if n_heads_in_group is None and replace_with_linear is False
|
||||||
|
# then _num_kv_heads[il] is 0 and _num_heads[il] is 0
|
||||||
|
# ***normal attention-layer***
|
||||||
|
# if n_heads_in_group is not None, then
|
||||||
|
# _num_kv_heads[il] is num_attention_head // n_heads_in_group and
|
||||||
|
# _num_heads[il] is num_attention_head
|
||||||
|
for il in range(len(_block_configs)):
|
||||||
|
if _block_configs[il]["attention"]["n_heads_in_group"] is None:
|
||||||
|
if _block_configs[il]["attention"]["replace_with_linear"] is True:
|
||||||
|
self._num_kv_heads.append(0)
|
||||||
|
self._num_heads.append(self.hparams["num_attention_heads"])
|
||||||
|
else:
|
||||||
|
self._num_kv_heads.append(0)
|
||||||
|
self._num_heads.append(0)
|
||||||
|
else:
|
||||||
|
self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
|
||||||
|
self._num_heads.append(self.hparams["num_attention_heads"])
|
||||||
|
_ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
|
||||||
|
assert self.block_count == len(self._num_kv_heads)
|
||||||
|
assert self.block_count == len(self._num_heads)
|
||||||
|
assert self.block_count == len(_ffn_multipliers)
|
||||||
|
assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
|
||||||
|
assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
|
||||||
|
assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
|
||||||
|
self._ffn_dims: list[int] = [
|
||||||
|
DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
|
||||||
|
for multiplier in _ffn_multipliers
|
||||||
|
]
|
||||||
|
|
||||||
|
def set_vocab(self):
|
||||||
|
# Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
|
||||||
|
# eos_token from '|eot_id|' to '|end_of_text|'
|
||||||
|
if self.hparams.get("vocab_size", 128256) == 128256:
|
||||||
|
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||||
|
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||||
|
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||||
|
self.gguf_writer.add_token_list(tokens)
|
||||||
|
self.gguf_writer.add_token_types(toktypes)
|
||||||
|
|
||||||
|
special_vocab = gguf.SpecialVocab(
|
||||||
|
self.dir_model, load_merges=True,
|
||||||
|
special_token_types = ['bos', 'eos', 'eom', 'eot']
|
||||||
|
)
|
||||||
|
special_vocab._set_special_token("bos", 128000)
|
||||||
|
special_vocab._set_special_token("eos", 128001)
|
||||||
|
special_vocab._set_special_token("eom", 128008)
|
||||||
|
special_vocab._set_special_token("eot", 128009)
|
||||||
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
else:
|
||||||
|
# DeciLM-7B
|
||||||
|
self._set_vocab_llama_hf()
|
||||||
|
# self._set_vocab_gpt2()
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
|
||||||
|
assert self.block_count == len(self._num_kv_heads)
|
||||||
|
assert self.block_count == len(self._num_heads)
|
||||||
|
assert self.block_count == len(self._ffn_dims)
|
||||||
|
self.gguf_writer.add_head_count_kv(self._num_kv_heads)
|
||||||
|
self.gguf_writer.add_head_count(self._num_heads)
|
||||||
|
self.gguf_writer.add_feed_forward_length(self._ffn_dims)
|
||||||
|
self.gguf_writer.add_block_count(self.block_count)
|
||||||
|
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
||||||
|
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||||
|
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||||
|
self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||||
|
self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
else: # DeciLM-7B
|
||||||
|
super().set_gguf_parameters()
|
||||||
|
if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B
|
||||||
|
self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
|
||||||
|
assert self.block_count == len(self._num_kv_heads)
|
||||||
|
self.gguf_writer.add_head_count_kv(self._num_kv_heads)
|
||||||
|
hparams = self.hparams
|
||||||
|
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||||
|
|
||||||
|
if "head_dim" in hparams:
|
||||||
|
rope_dim = hparams["head_dim"]
|
||||||
|
else:
|
||||||
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||||
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||||
|
|
||||||
|
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||||
|
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||||
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||||
|
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||||
|
|
||||||
|
@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
    """Reorder the rows of a projection tensor head by head.

    The leading dimension is viewed as ``(heads, 2, rows_per_half)``, the
    last two of those axes are swapped, and the result is flattened back to
    the input shape. ``n_head_kv`` replaces ``n_head`` as the head count
    whenever it is given and differs from ``n_head``.
    """
    heads = n_head
    if n_head_kv is not None and n_head_kv != n_head:
        heads = n_head_kv

    out_shape = weights.shape
    rows_per_half = out_shape[0] // heads // 2
    grouped = weights.reshape(heads, 2, rows_per_half, *out_shape[1:])
    return grouped.swapaxes(1, 2).reshape(out_shape)
|
||||||
|
|
||||||
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
    """Permute q/k projection tensors and map the tensor name.

    Head counts are resolved per block when ``bid`` is given: from
    ``num_key_value_heads_per_layer`` if present in hparams, otherwise from
    the per-block lists ``_num_kv_heads`` / ``_num_heads`` when hparams has
    ``block_configs``, otherwise from the global ``num_key_value_heads``.
    Returns a single ``(mapped_name, tensor)`` pair in a list.
    """
    hp = self.hparams
    q_heads = hp["num_attention_heads"]

    if bid is None:
        kv_heads = hp.get("num_key_value_heads")
    elif "num_key_value_heads_per_layer" in hp:
        kv_heads = hp["num_key_value_heads_per_layer"][bid]
    elif "block_configs" in hp:
        # per-block head counts override the global attention head count
        kv_heads = self._num_kv_heads[bid]
        q_heads = self._num_heads[bid]
    else:
        kv_heads = hp.get("num_key_value_heads")

    is_query = name.endswith(("q_proj.weight", "q_proj.bias"))
    is_key = name.endswith(("k_proj.weight", "k_proj.bias"))

    if is_query:
        data_torch = DeciModel.permute(data_torch, q_heads, q_heads)
    if is_key:
        data_torch = DeciModel.permute(data_torch, q_heads, kv_heads)

    return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
|
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
    """Yield the per-frequency rope factor tensor for "llama3" rope scaling.

    Nothing is yielded unless hparams contain a ``rope_scaling`` entry whose
    ``rope_type`` is "llama3" (case-insensitive). Otherwise one
    ``(tensor_name, float32 tensor)`` pair is yielded, holding one factor per
    rotary frequency.
    """
    rope_scaling = self.find_hparam(["rope_scaling"], optional=True)
    if not rope_scaling or rope_scaling.get("rope_type", '').lower() != "llama3":
        return

    base = self.hparams.get("rope_theta", 10000.0)
    dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
    freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

    factor = rope_scaling.get("factor", 8.0)
    low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
    high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
    # llama3-style configs carry the original context length inside the
    # rope_scaling dict; prefer it, then fall back to a top-level hparam and
    # finally to the historical default of 8192.
    old_context_len = rope_scaling.get(
        "original_max_position_embeddings",
        self.hparams.get("original_max_position_embeddings", 8192))

    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor
    assert low_freq_wavelen != high_freq_wavelen

    rope_factors = []
    for freq in freqs:
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:
            # short wavelengths are left unscaled
            rope_factors.append(1)
        elif wavelen > low_freq_wavelen:
            # long wavelengths get the full scaling factor
            rope_factors.append(factor)
        else:
            # in between: interpolate between the two regimes
            smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
            rope_factors.append(1 / ((1 - smooth) / factor + smooth))

    yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
|
||||||
|
|
||||||
|
def prepare_tensors(self):
    """Prepare tensors by delegating entirely to the parent class."""
    # no model-specific post-processing is done here
    super().prepare_tensors()
|
||||||
|
|
||||||
|
|
||||||
@Model.register("BitnetForCausalLM")
|
@Model.register("BitnetForCausalLM")
|
||||||
class BitnetModel(Model):
|
class BitnetModel(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.BITNET
|
model_arch = gguf.MODEL_ARCH.BITNET
|
||||||
@ -2628,7 +2822,7 @@ class InternLM2Model(Model):
|
|||||||
return [(self.map_tensor_name(name), data_torch)]
|
return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
|
|
||||||
@Model.register("BertModel", "CamembertModel")
|
@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
|
||||||
class BertModel(Model):
|
class BertModel(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.BERT
|
model_arch = gguf.MODEL_ARCH.BERT
|
||||||
|
|
||||||
@ -2694,10 +2888,25 @@ class BertModel(Model):
|
|||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
    """Normalise a BERT tensor name and drop tensors that are not exported.

    A leading "bert." is stripped, and ".gamma" / ".beta" suffixes are
    rewritten to ".weight" / ".bias". Position ids, the pooler dense layer
    and everything under "cls.predictions" / "cls.seq_relationship" yield an
    empty list; any other tensor is returned under its mapped name.
    """
    del bid  # unused

    prefix = "bert."
    if name.startswith(prefix):
        name = name[len(prefix):]

    for old_suffix, new_suffix in ((".gamma", ".weight"), (".beta", ".bias")):
        if name.endswith(old_suffix):
            name = name[:-len(old_suffix)] + new_suffix

    # we are only using BERT for embeddings so we don't need the pooling layer
    skipped_names = ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias")
    skipped_prefixes = ("cls.predictions", "cls.seq_relationship")
    if name in skipped_names or name.startswith(skipped_prefixes):
        return []  # we don't need these

    return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
|
|
||||||
|
@ -72,6 +72,7 @@ models = [
|
|||||||
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
|
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
|
||||||
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
|
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
|
||||||
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
|
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
|
||||||
|
{"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
|
||||||
{"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
|
{"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
|
||||||
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
|
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
|
||||||
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
|
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
|
||||||
@ -105,6 +106,7 @@ models = [
|
|||||||
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
|
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
|
||||||
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
|
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
|
||||||
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
|
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
|
||||||
|
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -15,7 +15,7 @@ static void run(
|
|||||||
for (size_t il = 0; il < v_input.size(); ++il) {
|
for (size_t il = 0; il < v_input.size(); ++il) {
|
||||||
// prepare output vector
|
// prepare output vector
|
||||||
struct ggml_tensor * ctrl_out = v_output[il];
|
struct ggml_tensor * ctrl_out = v_output[il];
|
||||||
ggml_format_name(ctrl_out, "direction.%ld", il+1);
|
ggml_format_name(ctrl_out, "direction.%zu", il+1);
|
||||||
|
|
||||||
// calculate mean vector
|
// calculate mean vector
|
||||||
struct ggml_tensor * t_layer = v_input[il];
|
struct ggml_tensor * t_layer = v_input[il];
|
||||||
|
@ -302,7 +302,7 @@ static void run_pca(
|
|||||||
|
|
||||||
// prepare output vector
|
// prepare output vector
|
||||||
struct ggml_tensor * ctrl_out = v_output[il];
|
struct ggml_tensor * ctrl_out = v_output[il];
|
||||||
ggml_format_name(ctrl_out, "direction.%ld", il+1);
|
ggml_format_name(ctrl_out, "direction.%zu", il+1);
|
||||||
|
|
||||||
// run power_iteration
|
// run power_iteration
|
||||||
params.i_layer = il;
|
params.i_layer = il;
|
||||||
|
@ -265,8 +265,8 @@ struct lora_merge_ctx {
|
|||||||
fout.write((const char *)data.data(), data.size());
|
fout.write((const char *)data.data(), data.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
|
printf("%s : merged %zu tensors with lora adapters\n", __func__, n_merged);
|
||||||
printf("%s : wrote %ld tensors to output file\n", __func__, trans.size());
|
printf("%s : wrote %zu tensors to output file\n", __func__, trans.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
void copy_tensor(struct ggml_tensor * base) {
|
void copy_tensor(struct ggml_tensor * base) {
|
||||||
@ -352,7 +352,7 @@ struct lora_merge_ctx {
|
|||||||
const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
|
const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
|
||||||
delta = ggml_scale(ctx0, delta, scale);
|
delta = ggml_scale(ctx0, delta, scale);
|
||||||
cur = ggml_add(ctx0, delta, cur);
|
cur = ggml_add(ctx0, delta, cur);
|
||||||
printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
|
printf("%s : + merging from adapter[%zu] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
|
||||||
printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
|
printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
|
||||||
}
|
}
|
||||||
cur = ggml_cast(ctx0, cur, out->type);
|
cur = ggml_cast(ctx0, cur, out->type);
|
||||||
|
@ -12,6 +12,10 @@
|
|||||||
#include "ggml-vulkan.h"
|
#include "ggml-vulkan.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_SYCL
|
||||||
|
#include "ggml-sycl.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "ggml-rpc.h"
|
#include "ggml-rpc.h"
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
# include <windows.h>
|
# include <windows.h>
|
||||||
@ -91,6 +95,12 @@ static ggml_backend_t create_backend() {
|
|||||||
if (!backend) {
|
if (!backend) {
|
||||||
fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
|
fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
|
||||||
}
|
}
|
||||||
|
#elif GGML_USE_SYCL
|
||||||
|
fprintf(stderr, "%s: using SYCL backend\n", __func__);
|
||||||
|
backend = ggml_backend_sycl_init(0); // init device 0
|
||||||
|
if (!backend) {
|
||||||
|
fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// if there aren't GPU Backends fallback to CPU backend
|
// if there aren't GPU Backends fallback to CPU backend
|
||||||
@ -106,6 +116,8 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
|
|||||||
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
|
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
|
||||||
#elif GGML_USE_VULKAN
|
#elif GGML_USE_VULKAN
|
||||||
ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
|
ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
|
||||||
|
#elif GGML_USE_SYCL
|
||||||
|
ggml_backend_sycl_get_device_memory(0, free_mem, total_mem);
|
||||||
#else
|
#else
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
MEMORYSTATUSEX status;
|
MEMORYSTATUSEX status;
|
||||||
|
@ -19,6 +19,8 @@ Options:
|
|||||||
Context size (default: 2048)
|
Context size (default: 2048)
|
||||||
-n, --ngl <value>
|
-n, --ngl <value>
|
||||||
Number of GPU layers (default: 0)
|
Number of GPU layers (default: 0)
|
||||||
|
--temp <value>
|
||||||
|
Temperature (default: 0.8)
|
||||||
-v, --verbose, --log-verbose
|
-v, --verbose, --log-verbose
|
||||||
Set verbosity level to infinity (i.e. log all messages, useful for debugging)
|
Set verbosity level to infinity (i.e. log all messages, useful for debugging)
|
||||||
-h, --help
|
-h, --help
|
||||||
|
@ -55,29 +55,51 @@ static int printe(const char * fmt, ...) {
|
|||||||
class Opt {
|
class Opt {
|
||||||
public:
|
public:
|
||||||
int init(int argc, const char ** argv) {
|
int init(int argc, const char ** argv) {
|
||||||
|
ctx_params = llama_context_default_params();
|
||||||
|
model_params = llama_model_default_params();
|
||||||
|
context_size_default = ctx_params.n_batch;
|
||||||
|
ngl_default = model_params.n_gpu_layers;
|
||||||
|
common_params_sampling sampling;
|
||||||
|
temperature_default = sampling.temp;
|
||||||
|
|
||||||
|
if (argc < 2) {
|
||||||
|
printe("Error: No arguments provided.\n");
|
||||||
|
print_help();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
// Parse arguments
|
// Parse arguments
|
||||||
if (parse(argc, argv)) {
|
if (parse(argc, argv)) {
|
||||||
printe("Error: Failed to parse arguments.\n");
|
printe("Error: Failed to parse arguments.\n");
|
||||||
help();
|
print_help();
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If help is requested, show help and exit
|
// If help is requested, show help and exit
|
||||||
if (help_) {
|
if (help) {
|
||||||
help();
|
print_help();
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
|
||||||
|
model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
|
||||||
|
temperature = temperature >= 0 ? temperature : temperature_default;
|
||||||
|
|
||||||
return 0; // Success
|
return 0; // Success
|
||||||
}
|
}
|
||||||
|
|
||||||
|
llama_context_params ctx_params;
|
||||||
|
llama_model_params model_params;
|
||||||
std::string model_;
|
std::string model_;
|
||||||
std::string user_;
|
std::string user;
|
||||||
int context_size_ = -1, ngl_ = -1;
|
int context_size = -1, ngl = -1;
|
||||||
bool verbose_ = false;
|
float temperature = -1;
|
||||||
|
bool verbose = false;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool help_ = false;
|
int context_size_default = -1, ngl_default = -1;
|
||||||
|
float temperature_default = -1;
|
||||||
|
bool help = false;
|
||||||
|
|
||||||
bool parse_flag(const char ** argv, int i, const char * short_opt, const char * long_opt) {
|
bool parse_flag(const char ** argv, int i, const char * short_opt, const char * long_opt) {
|
||||||
return strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0;
|
return strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0;
|
||||||
@ -89,6 +111,17 @@ class Opt {
|
|||||||
}
|
}
|
||||||
|
|
||||||
option_value = std::atoi(argv[++i]);
|
option_value = std::atoi(argv[++i]);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reads the float value that follows the option at argv[i].
// On success stores it in option_value, advances i past the value and
// returns 0. Returns 1 (leaving option_value untouched) when no value follows
// the option or the value is not a complete number, e.g. "--temp abc".
int handle_option_with_value(int argc, const char ** argv, int & i, float & option_value) {
    if (i + 1 >= argc) {
        return 1;
    }

    const char * arg = argv[++i];
    char *       end = nullptr;
    // strtof reports where parsing stopped, unlike atof which silently
    // turns unparsable input into 0
    const float  parsed = std::strtof(arg, &end);
    if (end == arg || *end != '\0') {
        return 1;
    }

    option_value = parsed;

    return 0;
}
|
||||||
|
|
||||||
@ -96,18 +129,22 @@ class Opt {
|
|||||||
bool options_parsing = true;
|
bool options_parsing = true;
|
||||||
for (int i = 1, positional_args_i = 0; i < argc; ++i) {
|
for (int i = 1, positional_args_i = 0; i < argc; ++i) {
|
||||||
if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
|
if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
|
||||||
if (handle_option_with_value(argc, argv, i, context_size_) == 1) {
|
if (handle_option_with_value(argc, argv, i, context_size) == 1) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
} else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) {
|
} else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) {
|
||||||
if (handle_option_with_value(argc, argv, i, ngl_) == 1) {
|
if (handle_option_with_value(argc, argv, i, ngl) == 1) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
|
||||||
|
if (handle_option_with_value(argc, argv, i, temperature) == 1) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
} else if (options_parsing &&
|
} else if (options_parsing &&
|
||||||
(parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
|
(parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
|
||||||
verbose_ = true;
|
verbose = true;
|
||||||
} else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
|
} else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
|
||||||
help_ = true;
|
help = true;
|
||||||
return 0;
|
return 0;
|
||||||
} else if (options_parsing && strcmp(argv[i], "--") == 0) {
|
} else if (options_parsing && strcmp(argv[i], "--") == 0) {
|
||||||
options_parsing = false;
|
options_parsing = false;
|
||||||
@ -120,16 +157,16 @@ class Opt {
|
|||||||
model_ = argv[i];
|
model_ = argv[i];
|
||||||
} else if (positional_args_i == 1) {
|
} else if (positional_args_i == 1) {
|
||||||
++positional_args_i;
|
++positional_args_i;
|
||||||
user_ = argv[i];
|
user = argv[i];
|
||||||
} else {
|
} else {
|
||||||
user_ += " " + std::string(argv[i]);
|
user += " " + std::string(argv[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void help() const {
|
void print_help() const {
|
||||||
printf(
|
printf(
|
||||||
"Description:\n"
|
"Description:\n"
|
||||||
" Runs a llm\n"
|
" Runs a llm\n"
|
||||||
@ -142,6 +179,8 @@ class Opt {
|
|||||||
" Context size (default: %d)\n"
|
" Context size (default: %d)\n"
|
||||||
" -n, --ngl <value>\n"
|
" -n, --ngl <value>\n"
|
||||||
" Number of GPU layers (default: %d)\n"
|
" Number of GPU layers (default: %d)\n"
|
||||||
|
" --temp <value>\n"
|
||||||
|
" Temperature (default: %.1f)\n"
|
||||||
" -v, --verbose, --log-verbose\n"
|
" -v, --verbose, --log-verbose\n"
|
||||||
" Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
|
" Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
|
||||||
" -h, --help\n"
|
" -h, --help\n"
|
||||||
@ -170,7 +209,7 @@ class Opt {
|
|||||||
" llama-run file://some-file3.gguf\n"
|
" llama-run file://some-file3.gguf\n"
|
||||||
" llama-run --ngl 999 some-file4.gguf\n"
|
" llama-run --ngl 999 some-file4.gguf\n"
|
||||||
" llama-run --ngl 999 some-file5.gguf Hello World\n",
|
" llama-run --ngl 999 some-file5.gguf Hello World\n",
|
||||||
llama_context_default_params().n_batch, llama_model_default_params().n_gpu_layers);
|
context_size_default, ngl_default, temperature_default);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -495,12 +534,12 @@ class LlamaData {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
context = initialize_context(model, opt.context_size_);
|
context = initialize_context(model, opt);
|
||||||
if (!context) {
|
if (!context) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
sampler = initialize_sampler();
|
sampler = initialize_sampler(opt);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -619,14 +658,12 @@ class LlamaData {
|
|||||||
// Initializes the model and returns a unique pointer to it
|
// Initializes the model and returns a unique pointer to it
|
||||||
llama_model_ptr initialize_model(Opt & opt) {
|
llama_model_ptr initialize_model(Opt & opt) {
|
||||||
ggml_backend_load_all();
|
ggml_backend_load_all();
|
||||||
llama_model_params model_params = llama_model_default_params();
|
|
||||||
model_params.n_gpu_layers = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers;
|
|
||||||
resolve_model(opt.model_);
|
resolve_model(opt.model_);
|
||||||
printe(
|
printe(
|
||||||
"\r%*s"
|
"\r%*s"
|
||||||
"\rLoading model",
|
"\rLoading model",
|
||||||
get_terminal_width(), " ");
|
get_terminal_width(), " ");
|
||||||
llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params));
|
llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), opt.model_params));
|
||||||
if (!model) {
|
if (!model) {
|
||||||
printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
|
printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
|
||||||
}
|
}
|
||||||
@ -636,10 +673,8 @@ class LlamaData {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Initializes the context with the specified parameters
|
// Initializes the context with the specified parameters
|
||||||
llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
|
llama_context_ptr initialize_context(const llama_model_ptr & model, const Opt & opt) {
|
||||||
llama_context_params ctx_params = llama_context_default_params();
|
llama_context_ptr context(llama_new_context_with_model(model.get(), opt.ctx_params));
|
||||||
ctx_params.n_ctx = ctx_params.n_batch = n_ctx >= 0 ? n_ctx : ctx_params.n_batch;
|
|
||||||
llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
|
|
||||||
if (!context) {
|
if (!context) {
|
||||||
printe("%s: error: failed to create the llama_context\n", __func__);
|
printe("%s: error: failed to create the llama_context\n", __func__);
|
||||||
}
|
}
|
||||||
@ -648,10 +683,10 @@ class LlamaData {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Initializes and configures the sampler
|
// Initializes and configures the sampler
|
||||||
llama_sampler_ptr initialize_sampler() {
|
llama_sampler_ptr initialize_sampler(const Opt & opt) {
|
||||||
llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
|
llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
|
||||||
llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
|
llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
|
||||||
llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
|
llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(opt.temperature));
|
||||||
llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
|
llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
|
||||||
|
|
||||||
return sampler;
|
return sampler;
|
||||||
@ -798,9 +833,9 @@ static int apply_chat_template_with_error_handling(LlamaData & llama_data, const
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Helper function to handle user input
|
// Helper function to handle user input
|
||||||
static int handle_user_input(std::string & user_input, const std::string & user_) {
|
static int handle_user_input(std::string & user_input, const std::string & user) {
|
||||||
if (!user_.empty()) {
|
if (!user.empty()) {
|
||||||
user_input = user_;
|
user_input = user;
|
||||||
return 0; // No need for interactive input
|
return 0; // No need for interactive input
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -832,17 +867,17 @@ static bool is_stdout_a_terminal() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Function to tokenize the prompt
|
// Function to tokenize the prompt
|
||||||
static int chat_loop(LlamaData & llama_data, const std::string & user_) {
|
static int chat_loop(LlamaData & llama_data, const std::string & user) {
|
||||||
int prev_len = 0;
|
int prev_len = 0;
|
||||||
llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
|
llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
|
||||||
static const bool stdout_a_terminal = is_stdout_a_terminal();
|
static const bool stdout_a_terminal = is_stdout_a_terminal();
|
||||||
while (true) {
|
while (true) {
|
||||||
// Get user input
|
// Get user input
|
||||||
std::string user_input;
|
std::string user_input;
|
||||||
while (handle_user_input(user_input, user_)) {
|
while (handle_user_input(user_input, user)) {
|
||||||
}
|
}
|
||||||
|
|
||||||
add_message("user", user_.empty() ? user_input : user_, llama_data);
|
add_message("user", user.empty() ? user_input : user, llama_data);
|
||||||
int new_len;
|
int new_len;
|
||||||
if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
|
if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
|
||||||
return 1;
|
return 1;
|
||||||
@ -854,7 +889,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!user_.empty()) {
|
if (!user.empty()) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -869,7 +904,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) {
|
|||||||
|
|
||||||
static void log_callback(const enum ggml_log_level level, const char * text, void * p) {
|
static void log_callback(const enum ggml_log_level level, const char * text, void * p) {
|
||||||
const Opt * opt = static_cast<Opt *>(p);
|
const Opt * opt = static_cast<Opt *>(p);
|
||||||
if (opt->verbose_ || level == GGML_LOG_LEVEL_ERROR) {
|
if (opt->verbose || level == GGML_LOG_LEVEL_ERROR) {
|
||||||
printe("%s", text);
|
printe("%s", text);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -890,11 +925,11 @@ int main(int argc, const char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!is_stdin_a_terminal()) {
|
if (!is_stdin_a_terminal()) {
|
||||||
if (!opt.user_.empty()) {
|
if (!opt.user.empty()) {
|
||||||
opt.user_ += "\n\n";
|
opt.user += "\n\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
opt.user_ += read_pipe_data();
|
opt.user += read_pipe_data();
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_log_set(log_callback, &opt);
|
llama_log_set(log_callback, &opt);
|
||||||
@ -903,7 +938,7 @@ int main(int argc, const char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (chat_loop(llama_data, opt.user_)) {
|
if (chat_loop(llama_data, opt.user)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -34,6 +34,7 @@ endforeach()
|
|||||||
add_executable(${TARGET} ${TARGET_SRCS})
|
add_executable(${TARGET} ${TARGET_SRCS})
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
|
||||||
|
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
|
||||||
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
||||||
if (LLAMA_SERVER_SSL)
|
if (LLAMA_SERVER_SSL)
|
||||||
|
@ -450,6 +450,8 @@ These words will not be included in the completion, so make sure to add them to
|
|||||||
|
|
||||||
`post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain.
|
`post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain.
|
||||||
|
|
||||||
|
`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error. Note that fields with a slash will be unnested; for example, `generation_settings/n_predict` will move the field `n_predict` from the `generation_settings` object to the root of the response and give it a new name.
|
||||||
|
|
||||||
**Response format**
|
**Response format**
|
||||||
|
|
||||||
- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
|
- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
|
||||||
@ -724,7 +726,8 @@ This endpoint is public (no API key check). By default, it is read-only. To make
|
|||||||
},
|
},
|
||||||
"total_slots": 1,
|
"total_slots": 1,
|
||||||
"model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
|
"model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
|
||||||
"chat_template": "..."
|
"chat_template": "...",
|
||||||
|
"build_info": "b(build number)-(build commit hash)"
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -92,6 +92,7 @@ struct slot_params {
|
|||||||
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
|
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
|
||||||
|
|
||||||
std::vector<std::string> antiprompt;
|
std::vector<std::string> antiprompt;
|
||||||
|
std::vector<std::string> response_fields;
|
||||||
bool timings_per_token = false;
|
bool timings_per_token = false;
|
||||||
bool post_sampling_probs = false;
|
bool post_sampling_probs = false;
|
||||||
bool ignore_eos = false;
|
bool ignore_eos = false;
|
||||||
@ -209,6 +210,7 @@ struct server_task {
|
|||||||
params.n_discard = json_value(data, "n_discard", defaults.n_discard);
|
params.n_discard = json_value(data, "n_discard", defaults.n_discard);
|
||||||
//params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
|
//params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
|
||||||
params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
|
params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
|
||||||
|
params.response_fields = json_value(data, "response_fields", std::vector<std::string>());
|
||||||
|
|
||||||
params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
|
params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
|
||||||
params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
|
params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
|
||||||
@ -522,6 +524,7 @@ struct server_task_result_cmpl_final : server_task_result {
|
|||||||
|
|
||||||
bool post_sampling_probs;
|
bool post_sampling_probs;
|
||||||
std::vector<completion_token_output> probs_output;
|
std::vector<completion_token_output> probs_output;
|
||||||
|
std::vector<std::string> response_fields;
|
||||||
|
|
||||||
slot_params generation_params;
|
slot_params generation_params;
|
||||||
|
|
||||||
@ -568,7 +571,7 @@ struct server_task_result_cmpl_final : server_task_result {
|
|||||||
if (!stream && !probs_output.empty()) {
|
if (!stream && !probs_output.empty()) {
|
||||||
res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
|
res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
|
||||||
}
|
}
|
||||||
return res;
|
return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
|
||||||
}
|
}
|
||||||
|
|
||||||
json to_json_oaicompat_chat() {
|
json to_json_oaicompat_chat() {
|
||||||
@ -598,6 +601,7 @@ struct server_task_result_cmpl_final : server_task_result {
|
|||||||
{"choices", json::array({choice})},
|
{"choices", json::array({choice})},
|
||||||
{"created", t},
|
{"created", t},
|
||||||
{"model", oaicompat_model},
|
{"model", oaicompat_model},
|
||||||
|
{"system_fingerprint", build_info},
|
||||||
{"object", "chat.completion"},
|
{"object", "chat.completion"},
|
||||||
{"usage", json {
|
{"usage", json {
|
||||||
{"completion_tokens", n_decoded},
|
{"completion_tokens", n_decoded},
|
||||||
@ -636,6 +640,7 @@ struct server_task_result_cmpl_final : server_task_result {
|
|||||||
{"created", t},
|
{"created", t},
|
||||||
{"id", oaicompat_cmpl_id},
|
{"id", oaicompat_cmpl_id},
|
||||||
{"model", oaicompat_model},
|
{"model", oaicompat_model},
|
||||||
|
{"system_fingerprint", build_info},
|
||||||
{"object", "chat.completion.chunk"},
|
{"object", "chat.completion.chunk"},
|
||||||
{"usage", json {
|
{"usage", json {
|
||||||
{"completion_tokens", n_decoded},
|
{"completion_tokens", n_decoded},
|
||||||
@ -765,6 +770,7 @@ struct server_task_result_cmpl_partial : server_task_result {
|
|||||||
{"created", t},
|
{"created", t},
|
||||||
{"id", oaicompat_cmpl_id},
|
{"id", oaicompat_cmpl_id},
|
||||||
{"model", oaicompat_model},
|
{"model", oaicompat_model},
|
||||||
|
{"system_fingerprint", build_info},
|
||||||
{"object", "chat.completion.chunk"}
|
{"object", "chat.completion.chunk"}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1850,6 +1856,8 @@ struct server_context {
|
|||||||
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
|
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
|
||||||
slot.n_sent_text += result.text_to_send.size();
|
slot.n_sent_text += result.text_to_send.size();
|
||||||
// add the token to slot queue and cache
|
// add the token to slot queue and cache
|
||||||
|
} else {
|
||||||
|
result.text_to_send = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.add_token(result);
|
slot.add_token(result);
|
||||||
@ -2063,6 +2071,7 @@ struct server_context {
|
|||||||
res->tokens = slot.generated_tokens;
|
res->tokens = slot.generated_tokens;
|
||||||
res->timings = slot.get_timings();
|
res->timings = slot.get_timings();
|
||||||
res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
|
res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
|
||||||
|
res->response_fields = slot.params.response_fields;
|
||||||
|
|
||||||
res->truncated = slot.truncated;
|
res->truncated = slot.truncated;
|
||||||
res->n_decoded = slot.n_decoded;
|
res->n_decoded = slot.n_decoded;
|
||||||
@ -3476,6 +3485,7 @@ int main(int argc, char ** argv) {
|
|||||||
{ "total_slots", ctx_server.params_base.n_parallel },
|
{ "total_slots", ctx_server.params_base.n_parallel },
|
||||||
{ "model_path", ctx_server.params_base.model },
|
{ "model_path", ctx_server.params_base.model },
|
||||||
{ "chat_template", llama_get_chat_template(ctx_server.model) },
|
{ "chat_template", llama_get_chat_template(ctx_server.model) },
|
||||||
|
{ "build_info", build_info },
|
||||||
};
|
};
|
||||||
|
|
||||||
res_ok(res, data);
|
res_ok(res, data);
|
||||||
@ -3697,7 +3707,7 @@ int main(int argc, char ** argv) {
|
|||||||
{"object", "list"},
|
{"object", "list"},
|
||||||
{"data", {
|
{"data", {
|
||||||
{
|
{
|
||||||
{"id", params.model_alias},
|
{"id", params.model_alias.empty() ? params.model : params.model_alias},
|
||||||
{"object", "model"},
|
{"object", "model"},
|
||||||
{"created", std::time(0)},
|
{"created", std::time(0)},
|
||||||
{"owned_by", "llamacpp"},
|
{"owned_by", "llamacpp"},
|
||||||
@ -3782,6 +3792,17 @@ int main(int argc, char ** argv) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool use_base64 = false;
|
||||||
|
if (body.count("encoding_format") != 0) {
|
||||||
|
const std::string& format = body.at("encoding_format");
|
||||||
|
if (format == "base64") {
|
||||||
|
use_base64 = true;
|
||||||
|
} else if (format != "float") {
|
||||||
|
res_error(res, format_error_response("The format to return the embeddings in. Can be either float or base64", ERROR_TYPE_INVALID_REQUEST));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
|
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
|
||||||
for (const auto & tokens : tokenized_prompts) {
|
for (const auto & tokens : tokenized_prompts) {
|
||||||
// this check is necessary for models that do not add BOS token to the input
|
// this check is necessary for models that do not add BOS token to the input
|
||||||
@ -3833,7 +3854,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// write JSON response
|
// write JSON response
|
||||||
json root = oaicompat ? format_embeddings_response_oaicompat(body, responses) : json(responses);
|
json root = oaicompat ? format_embeddings_response_oaicompat(body, responses, use_base64) : json(responses);
|
||||||
res_ok(res, root);
|
res_ok(res, root);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -31,6 +31,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
|
|||||||
})
|
})
|
||||||
assert res.status_code == 200
|
assert res.status_code == 200
|
||||||
assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
|
assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
|
||||||
|
assert res.body["system_fingerprint"].startswith("b")
|
||||||
assert res.body["model"] == model if model is not None else server.model_alias
|
assert res.body["model"] == model if model is not None else server.model_alias
|
||||||
assert res.body["usage"]["prompt_tokens"] == n_prompt
|
assert res.body["usage"]["prompt_tokens"] == n_prompt
|
||||||
assert res.body["usage"]["completion_tokens"] == n_predicted
|
assert res.body["usage"]["completion_tokens"] == n_predicted
|
||||||
@ -63,6 +64,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
|
|||||||
last_cmpl_id = None
|
last_cmpl_id = None
|
||||||
for data in res:
|
for data in res:
|
||||||
choice = data["choices"][0]
|
choice = data["choices"][0]
|
||||||
|
assert data["system_fingerprint"].startswith("b")
|
||||||
assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
|
assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
|
||||||
if last_cmpl_id is None:
|
if last_cmpl_id is None:
|
||||||
last_cmpl_id = data["id"]
|
last_cmpl_id = data["id"]
|
||||||
@ -92,6 +94,7 @@ def test_chat_completion_with_openai_library():
|
|||||||
seed=42,
|
seed=42,
|
||||||
temperature=0.8,
|
temperature=0.8,
|
||||||
)
|
)
|
||||||
|
assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
|
||||||
assert res.choices[0].finish_reason == "length"
|
assert res.choices[0].finish_reason == "length"
|
||||||
assert res.choices[0].message.content is not None
|
assert res.choices[0].message.content is not None
|
||||||
assert match_regex("(Suddenly)+", res.choices[0].message.content)
|
assert match_regex("(Suddenly)+", res.choices[0].message.content)
|
||||||
|
@ -95,7 +95,7 @@ def test_consistent_result_same_seed(n_slots: int):
|
|||||||
res = server.make_request("POST", "/completion", data={
|
res = server.make_request("POST", "/completion", data={
|
||||||
"prompt": "I believe the meaning of life is",
|
"prompt": "I believe the meaning of life is",
|
||||||
"seed": 42,
|
"seed": 42,
|
||||||
"temperature": 1.0,
|
"temperature": 0.0,
|
||||||
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
|
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
|
||||||
})
|
})
|
||||||
if last_res is not None:
|
if last_res is not None:
|
||||||
@ -120,9 +120,10 @@ def test_different_result_different_seed(n_slots: int):
|
|||||||
assert res.body["content"] != last_res.body["content"]
|
assert res.body["content"] != last_res.body["content"]
|
||||||
last_res = res
|
last_res = res
|
||||||
|
|
||||||
|
# TODO: figure out why it doesn't work with temperature = 1
|
||||||
|
# @pytest.mark.parametrize("temperature", [0.0, 1.0])
|
||||||
@pytest.mark.parametrize("n_batch", [16, 32])
|
@pytest.mark.parametrize("n_batch", [16, 32])
|
||||||
@pytest.mark.parametrize("temperature", [0.0, 1.0])
|
@pytest.mark.parametrize("temperature", [0.0])
|
||||||
def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
|
def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
|
||||||
global server
|
global server
|
||||||
server.n_batch = n_batch
|
server.n_batch = n_batch
|
||||||
@ -257,6 +258,40 @@ def test_completion_parallel_slots(n_slots: int, n_requests: int):
|
|||||||
# assert match_regex(re_content, res.body["content"])
|
# assert match_regex(re_content, res.body["content"])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "prompt,n_predict,response_fields",
    [
        ("I believe the meaning of life is", 8, []),
        ("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]),
    ],
)
def test_completion_response_fields(
    prompt: str, n_predict: int, response_fields: list[str]
):
    """Check the `response_fields` filter of the /completion endpoint.

    With an empty `response_fields` list the body is expected to be unfiltered
    (it must still contain "generation_settings"). With a non-empty list the
    body must contain exactly the requested entries, each keyed by its full
    '/'-separated path.
    """
    global server
    server.start()

    payload = {
        "n_predict": n_predict,
        "prompt": prompt,
        "response_fields": response_fields,
    }
    res = server.make_request("POST", "/completion", data=payload)

    assert res.status_code == 200
    body = res.body
    assert "content" in body
    assert len(body["content"])

    if not response_fields:
        # no filter requested: the regular, non-empty response comes back
        assert len(body)
        assert "generation_settings" in body
        return

    # filtered response: nested values are addressed by their full path string
    assert body["generation_settings/n_predict"] == n_predict
    assert body["prompt"] == "<s> " + prompt
    assert isinstance(body["content"], str)
    assert len(body) == len(response_fields)
|
||||||
|
|
||||||
|
|
||||||
def test_n_probs():
|
def test_n_probs():
|
||||||
global server
|
global server
|
||||||
server.start()
|
server.start()
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
import base64
|
||||||
|
import struct
|
||||||
import pytest
|
import pytest
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
from utils import *
|
from utils import *
|
||||||
@ -194,3 +196,42 @@ def test_embedding_usage_multiple():
|
|||||||
assert res.status_code == 200
|
assert res.status_code == 200
|
||||||
assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
|
assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
|
||||||
assert res.body['usage']['prompt_tokens'] == 2 * 9
|
assert res.body['usage']['prompt_tokens'] == 2 * 9
|
||||||
|
|
||||||
|
|
||||||
|
def test_embedding_openai_library_base64():
    """Request the same embedding as floats and as base64 and compare them.

    The base64 payload is decoded and unpacked as 4-byte floats, then checked
    element-wise against the float-list response within EPSILON.
    """
    server.start()
    test_input = "Test base64 embedding output"

    # reference embedding in the default (float list) format
    float_res = server.make_request("POST", "/v1/embeddings", data={
        "input": test_input
    })
    assert float_res.status_code == 200
    vec0 = float_res.body["data"][0]["embedding"]

    # same input, base64 output
    b64_res = server.make_request("POST", "/v1/embeddings", data={
        "input": test_input,
        "encoding_format": "base64"
    })
    assert b64_res.status_code == 200
    assert "data" in b64_res.body
    assert len(b64_res.body["data"]) == 1

    embedding_data = b64_res.body["data"][0]
    assert "embedding" in embedding_data
    encoded = embedding_data["embedding"]
    assert isinstance(encoded, str)

    # the string must be valid base64 ...
    raw = base64.b64decode(encoded)
    # ... and the raw bytes must unpack into floats (4 bytes each)
    n_floats = len(raw) // 4
    floats = struct.unpack(f'{n_floats}f', raw)
    assert len(floats) > 0
    assert all(isinstance(value, float) for value in floats)
    assert len(floats) == len(vec0)

    # decoded values must match the float-format response
    assert all(abs(x - y) < EPSILON for x, y in zip(floats, vec0))
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
#include "common/base64.hpp"
|
||||||
|
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
// crash the server in debug mode, otherwise send an http 500 error
|
// crash the server in debug mode, otherwise send an http 500 error
|
||||||
@ -56,6 +57,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
|
||||||
|
|
||||||
//
|
//
|
||||||
// tokenizer and input processing utils
|
// tokenizer and input processing utils
|
||||||
//
|
//
|
||||||
@ -88,6 +91,28 @@ static bool json_is_array_of_mixed_numbers_strings(const json & data) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// get value by path(key1 / key2)
|
||||||
|
static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
|
||||||
|
json result = json::object();
|
||||||
|
|
||||||
|
for (const std::string & path : paths) {
|
||||||
|
json current = js;
|
||||||
|
const auto keys = string_split<std::string>(path, /*separator*/ '/');
|
||||||
|
bool valid_path = true;
|
||||||
|
for (const std::string & k : keys) {
|
||||||
|
if (valid_path && current.is_object() && current.contains(k)) {
|
||||||
|
current = current[k];
|
||||||
|
} else {
|
||||||
|
valid_path = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (valid_path) {
|
||||||
|
result[path] = current;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* this handles 2 cases:
|
* this handles 2 cases:
|
||||||
* - only string, example: "string"
|
* - only string, example: "string"
|
||||||
@ -589,16 +614,31 @@ static json oaicompat_completion_params_parse(
|
|||||||
return llama_params;
|
return llama_params;
|
||||||
}
|
}
|
||||||
|
|
||||||
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
|
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) {
|
||||||
json data = json::array();
|
json data = json::array();
|
||||||
int32_t n_tokens = 0;
|
int32_t n_tokens = 0;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (const auto & elem : embeddings) {
|
for (const auto & elem : embeddings) {
|
||||||
data.push_back(json{
|
json embedding_obj;
|
||||||
|
|
||||||
|
if (use_base64) {
|
||||||
|
const auto& vec = json_value(elem, "embedding", json::array()).get<std::vector<float>>();
|
||||||
|
const char* data_ptr = reinterpret_cast<const char*>(vec.data());
|
||||||
|
size_t data_size = vec.size() * sizeof(float);
|
||||||
|
embedding_obj = {
|
||||||
|
{"embedding", base64::encode(data_ptr, data_size)},
|
||||||
|
{"index", i++},
|
||||||
|
{"object", "embedding"},
|
||||||
|
{"encoding_format", "base64"}
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
embedding_obj = {
|
||||||
{"embedding", json_value(elem, "embedding", json::array())},
|
{"embedding", json_value(elem, "embedding", json::array())},
|
||||||
{"index", i++},
|
{"index", i++},
|
||||||
{"object", "embedding"}
|
{"object", "embedding"}
|
||||||
});
|
};
|
||||||
|
}
|
||||||
|
data.push_back(embedding_obj);
|
||||||
|
|
||||||
n_tokens += json_value(elem, "tokens_evaluated", 0);
|
n_tokens += json_value(elem, "tokens_evaluated", 0);
|
||||||
}
|
}
|
||||||
|
@ -234,6 +234,7 @@ function(ggml_add_backend_library backend)
|
|||||||
# write the shared library to the output directory
|
# write the shared library to the output directory
|
||||||
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||||
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
|
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
|
||||||
|
add_dependencies(ggml ${backend})
|
||||||
else()
|
else()
|
||||||
add_library(${backend} ${ARGN})
|
add_library(${backend} ${ARGN})
|
||||||
target_link_libraries(ggml PUBLIC ${backend})
|
target_link_libraries(ggml PUBLIC ${backend})
|
||||||
|
@ -66,6 +66,26 @@
|
|||||||
#include "ggml-kompute.h"
|
#include "ggml-kompute.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// disable C++17 deprecation warning for std::codecvt_utf8
|
||||||
|
#if defined(__clang__)
|
||||||
|
# pragma clang diagnostic push
|
||||||
|
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Convert a UTF-8 byte string to a wide string via std::codecvt_utf8_utf16.
static std::wstring utf8_to_utf16(const std::string & str) {
    using utf8_utf16_facet = std::codecvt_utf8_utf16<wchar_t>;
    return std::wstring_convert<utf8_utf16_facet>().from_bytes(str);
}
|
||||||
|
|
||||||
|
// Convert a wide string to a UTF-8 byte string via std::codecvt_utf8_utf16.
static std::string utf16_to_utf8(const std::wstring & str) {
    using utf8_utf16_facet = std::codecvt_utf8_utf16<wchar_t>;
    return std::wstring_convert<utf8_utf16_facet>().to_bytes(str);
}
|
||||||
|
|
||||||
|
#if defined(__clang__)
|
||||||
|
# pragma clang diagnostic pop
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
|
|
||||||
using dl_handle = std::remove_pointer_t<HMODULE>;
|
using dl_handle = std::remove_pointer_t<HMODULE>;
|
||||||
@ -88,11 +108,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
|
|||||||
return handle;
|
return handle;
|
||||||
}
|
}
|
||||||
|
|
||||||
static dl_handle * dl_load_library(const std::string & path) {
|
|
||||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
|
||||||
return dl_load_library(converter.from_bytes(path));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void * dl_get_sym(dl_handle * handle, const char * name) {
|
static void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||||
@ -114,8 +129,8 @@ struct dl_handle_deleter {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static void * dl_load_library(const std::string & path) {
|
static void * dl_load_library(const std::wstring & path) {
|
||||||
dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
|
dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||||
|
|
||||||
return handle;
|
return handle;
|
||||||
}
|
}
|
||||||
@ -202,11 +217,11 @@ struct ggml_backend_registry {
|
|||||||
devices.push_back(device);
|
devices.push_back(device);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_reg_t load_backend(const char * path, bool silent) {
|
ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
|
||||||
dl_handle_ptr handle { dl_load_library(path) };
|
dl_handle_ptr handle { dl_load_library(path) };
|
||||||
if (!handle) {
|
if (!handle) {
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
|
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
@ -214,7 +229,7 @@ struct ggml_backend_registry {
|
|||||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||||
if (score_fn && score_fn() == 0) {
|
if (score_fn && score_fn() == 0) {
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
|
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
@ -222,7 +237,7 @@ struct ggml_backend_registry {
|
|||||||
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
|
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
|
||||||
if (!backend_init_fn) {
|
if (!backend_init_fn) {
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
|
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
@ -231,16 +246,16 @@ struct ggml_backend_registry {
|
|||||||
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
|
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
if (!reg) {
|
if (!reg) {
|
||||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
|
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
|
||||||
} else {
|
} else {
|
||||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
|
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
|
||||||
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
|
__func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
|
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
|
||||||
|
|
||||||
register_backend(reg, std::move(handle));
|
register_backend(reg, std::move(handle));
|
||||||
|
|
||||||
@ -376,14 +391,14 @@ ggml_backend_t ggml_backend_init_best(void) {
|
|||||||
|
|
||||||
// Dynamic loading
|
// Dynamic loading
|
||||||
ggml_backend_reg_t ggml_backend_load(const char * path) {
|
ggml_backend_reg_t ggml_backend_load(const char * path) {
|
||||||
return get_reg().load_backend(path, false);
|
return get_reg().load_backend(utf8_to_utf16(path), false);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_unload(ggml_backend_reg_t reg) {
|
void ggml_backend_unload(ggml_backend_reg_t reg) {
|
||||||
get_reg().unload_backend(reg, true);
|
get_reg().unload_backend(reg, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string get_executable_path() {
|
static std::wstring get_executable_path() {
|
||||||
#if defined(__APPLE__)
|
#if defined(__APPLE__)
|
||||||
// get executable path
|
// get executable path
|
||||||
std::vector<char> path;
|
std::vector<char> path;
|
||||||
@ -401,13 +416,17 @@ static std::string get_executable_path() {
|
|||||||
if (last_slash != std::string::npos) {
|
if (last_slash != std::string::npos) {
|
||||||
base_path = base_path.substr(0, last_slash);
|
base_path = base_path.substr(0, last_slash);
|
||||||
}
|
}
|
||||||
return base_path + "/";
|
return utf8_to_utf16(base_path + "/");
|
||||||
#elif defined(__linux__)
|
#elif defined(__linux__) || defined(__FreeBSD__)
|
||||||
std::string base_path = ".";
|
std::string base_path = ".";
|
||||||
std::vector<char> path(1024);
|
std::vector<char> path(1024);
|
||||||
while (true) {
|
while (true) {
|
||||||
// get executable path
|
// get executable path
|
||||||
|
# if defined(__linux__)
|
||||||
ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
|
ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
|
||||||
|
# elif defined(__FreeBSD__)
|
||||||
|
ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
|
||||||
|
# endif
|
||||||
if (len == -1) {
|
if (len == -1) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -423,57 +442,63 @@ static std::string get_executable_path() {
|
|||||||
path.resize(path.size() * 2);
|
path.resize(path.size() * 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
return base_path + "/";
|
return utf8_to_utf16(base_path + "/");
|
||||||
#elif defined(_WIN32)
|
#elif defined(_WIN32)
|
||||||
std::vector<char> path(MAX_PATH);
|
std::vector<wchar_t> path(MAX_PATH);
|
||||||
DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
|
DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
|
||||||
if (len == 0) {
|
if (len == 0) {
|
||||||
return "";
|
return {};
|
||||||
}
|
}
|
||||||
std::string base_path(path.data(), len);
|
std::wstring base_path(path.data(), len);
|
||||||
// remove executable name
|
// remove executable name
|
||||||
auto last_slash = base_path.find_last_of('\\');
|
auto last_slash = base_path.find_last_of('\\');
|
||||||
if (last_slash != std::string::npos) {
|
if (last_slash != std::string::npos) {
|
||||||
base_path = base_path.substr(0, last_slash);
|
base_path = base_path.substr(0, last_slash);
|
||||||
}
|
}
|
||||||
return base_path + "\\";
|
return base_path + L"\\";
|
||||||
|
#else
|
||||||
|
return {};
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string backend_filename_prefix() {
|
static std::wstring backend_filename_prefix() {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
return "ggml-";
|
return L"ggml-";
|
||||||
#else
|
#else
|
||||||
return "libggml-";
|
return L"libggml-";
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string backend_filename_suffix() {
|
static std::wstring backend_filename_suffix() {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
return ".dll";
|
return L".dll";
|
||||||
#else
|
#else
|
||||||
return ".so";
|
return L".so";
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::wstring path_separator() {
|
||||||
|
#ifdef _WIN32
|
||||||
|
return L"\\";
|
||||||
|
#else
|
||||||
|
return L"/";
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
|
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
|
||||||
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
|
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
|
||||||
// TODO: search system paths
|
// TODO: search system paths
|
||||||
std::string file_prefix = backend_filename_prefix() + name + "-";
|
std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
|
||||||
std::vector<std::string> search_paths;
|
std::vector<std::wstring> search_paths;
|
||||||
if (user_search_path == nullptr) {
|
if (user_search_path == nullptr) {
|
||||||
search_paths.push_back("./");
|
search_paths.push_back(L"." + path_separator());
|
||||||
search_paths.push_back(get_executable_path());
|
search_paths.push_back(get_executable_path());
|
||||||
} else {
|
} else {
|
||||||
#if defined(_WIN32)
|
search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
|
||||||
search_paths.push_back(std::string(user_search_path) + "\\");
|
|
||||||
#else
|
|
||||||
search_paths.push_back(std::string(user_search_path) + "/");
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int best_score = 0;
|
int best_score = 0;
|
||||||
std::string best_path;
|
std::wstring best_path;
|
||||||
|
|
||||||
namespace fs = std::filesystem;
|
namespace fs = std::filesystem;
|
||||||
for (const auto & search_path : search_paths) {
|
for (const auto & search_path : search_paths) {
|
||||||
@ -483,27 +508,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
|||||||
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
|
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
|
||||||
for (const auto & entry : dir_it) {
|
for (const auto & entry : dir_it) {
|
||||||
if (entry.is_regular_file()) {
|
if (entry.is_regular_file()) {
|
||||||
std::string filename = entry.path().filename().string();
|
std::wstring filename = entry.path().filename().wstring();
|
||||||
std::string ext = entry.path().extension().string();
|
std::wstring ext = entry.path().extension().wstring();
|
||||||
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
|
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
|
||||||
dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
|
dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
|
||||||
if (!handle && !silent) {
|
if (!handle && !silent) {
|
||||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
|
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
||||||
}
|
}
|
||||||
if (handle) {
|
if (handle) {
|
||||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||||
if (score_fn) {
|
if (score_fn) {
|
||||||
int s = score_fn();
|
int s = score_fn();
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
|
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
|
||||||
#endif
|
#endif
|
||||||
if (s > best_score) {
|
if (s > best_score) {
|
||||||
best_score = s;
|
best_score = s;
|
||||||
best_path = entry.path().string();
|
best_path = entry.path().wstring();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
|
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -515,15 +540,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
|||||||
if (best_score == 0) {
|
if (best_score == 0) {
|
||||||
// try to load the base backend
|
// try to load the base backend
|
||||||
for (const auto & search_path : search_paths) {
|
for (const auto & search_path : search_paths) {
|
||||||
std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
|
std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
|
||||||
if (fs::exists(path)) {
|
if (fs::exists(path)) {
|
||||||
return get_reg().load_backend(path.c_str(), silent);
|
return get_reg().load_backend(path, silent);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
return get_reg().load_backend(best_path.c_str(), silent);
|
return get_reg().load_backend(best_path, silent);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_load_all() {
|
void ggml_backend_load_all() {
|
||||||
|
@ -135,14 +135,20 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
# show enabled features
|
# show enabled features
|
||||||
|
if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
|
||||||
|
set(FEAT_INPUT_FILE "NUL")
|
||||||
|
else()
|
||||||
|
set(FEAT_INPUT_FILE "/dev/null")
|
||||||
|
endif()
|
||||||
|
|
||||||
execute_process(
|
execute_process(
|
||||||
COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
|
COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
|
||||||
INPUT_FILE "/dev/null"
|
INPUT_FILE ${FEAT_INPUT_FILE}
|
||||||
OUTPUT_VARIABLE ARM_FEATURE
|
OUTPUT_VARIABLE ARM_FEATURE
|
||||||
RESULT_VARIABLE ARM_FEATURE_RESULT
|
RESULT_VARIABLE ARM_FEATURE_RESULT
|
||||||
)
|
)
|
||||||
if (ARM_FEATURE_RESULT)
|
if (ARM_FEATURE_RESULT)
|
||||||
message(FATAL_ERROR "Failed to get ARM features")
|
message(WARNING "Failed to get ARM features")
|
||||||
else()
|
else()
|
||||||
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
|
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
|
||||||
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
|
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
|
||||||
@ -317,6 +323,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|||||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
||||||
|
|
||||||
if (GGML_BACKEND_DL)
|
if (GGML_BACKEND_DL)
|
||||||
|
if (GGML_NATIVE)
|
||||||
|
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
|
||||||
|
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
|
||||||
|
endif()
|
||||||
|
|
||||||
# The feature detection code is compiled as a separate target so that
|
# The feature detection code is compiled as a separate target so that
|
||||||
# it can be built without the architecture flags
|
# it can be built without the architecture flags
|
||||||
# Since multiple variants of the CPU backend may be included in the same
|
# Since multiple variants of the CPU backend may be included in the same
|
||||||
|
@ -986,7 +986,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
|
|||||||
#define GGML_F16_STEP 32
|
#define GGML_F16_STEP 32
|
||||||
#define GGML_F16_EPR 4
|
#define GGML_F16_EPR 4
|
||||||
|
|
||||||
static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
|
static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
|
||||||
float tmp[4];
|
float tmp[4];
|
||||||
|
|
||||||
tmp[0] = GGML_FP16_TO_FP32(x[0]);
|
tmp[0] = GGML_FP16_TO_FP32(x[0]);
|
||||||
@ -997,7 +997,7 @@ static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
|
|||||||
return _mm_loadu_ps(tmp);
|
return _mm_loadu_ps(tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
|
static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
||||||
float arr[4];
|
float arr[4];
|
||||||
|
|
||||||
_mm_storeu_ps(arr, y);
|
_mm_storeu_ps(arr, y);
|
||||||
@ -7419,14 +7419,14 @@ static void ggml_compute_forward_mul_mat(
|
|||||||
if (src1_cont) {
|
if (src1_cont) {
|
||||||
for (int64_t i13 = 0; i13 < ne13; i13++)
|
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||||
for (int64_t i12 = 0; i12 < ne12; i12++)
|
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||||
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
if (!llamafile_sgemm(params,
|
||||||
|
ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||||
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
||||||
nb01/ggml_type_size(src0->type),
|
nb01/ggml_type_size(src0->type),
|
||||||
(const char *)src1->data + i12*nb12 + i13*nb13,
|
(const char *)src1->data + i12*nb12 + i13*nb13,
|
||||||
nb11/ggml_type_size(src1->type),
|
nb11/ggml_type_size(src1->type),
|
||||||
(char *)dst->data + i12*nb2 + i13*nb3,
|
(char *)dst->data + i12*nb2 + i13*nb3,
|
||||||
nb1/ggml_type_size(dst->type),
|
nb1/ggml_type_size(dst->type),
|
||||||
ith, nth,
|
|
||||||
src0->type,
|
src0->type,
|
||||||
src1->type,
|
src1->type,
|
||||||
dst->type))
|
dst->type))
|
||||||
@ -7471,14 +7471,14 @@ UseGgmlGemm1:;
|
|||||||
|
|
||||||
for (int64_t i13 = 0; i13 < ne13; i13++)
|
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||||
for (int64_t i12 = 0; i12 < ne12; i12++)
|
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||||
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
if (!llamafile_sgemm(params,
|
||||||
|
ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||||
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
||||||
nb01/ggml_type_size(src0->type),
|
nb01/ggml_type_size(src0->type),
|
||||||
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
|
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
|
||||||
row_size/ggml_type_size(vec_dot_type),
|
row_size/ggml_type_size(vec_dot_type),
|
||||||
(char *)dst->data + i12*nb2 + i13*nb3,
|
(char *)dst->data + i12*nb2 + i13*nb3,
|
||||||
nb1/ggml_type_size(dst->type),
|
nb1/ggml_type_size(dst->type),
|
||||||
ith, nth,
|
|
||||||
src0->type,
|
src0->type,
|
||||||
vec_dot_type,
|
vec_dot_type,
|
||||||
dst->type))
|
dst->type))
|
||||||
|
@ -53,6 +53,8 @@
|
|||||||
#include "ggml-cpu-impl.h"
|
#include "ggml-cpu-impl.h"
|
||||||
#include "ggml-quants.h"
|
#include "ggml-quants.h"
|
||||||
|
|
||||||
|
#include <atomic>
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
#define NOINLINE __declspec(noinline)
|
#define NOINLINE __declspec(noinline)
|
||||||
#else
|
#else
|
||||||
@ -134,6 +136,16 @@ inline __m512 madd(__m512 a, __m512 b, __m512 c) {
|
|||||||
return _mm512_fmadd_ps(a, b, c);
|
return _mm512_fmadd_ps(a, b, c);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(__AVX512BF16__)
// bf16 overloads of madd(a, b, c).
// They forward to the AVX512-BF16 dot-product intrinsics, which take the
// float accumulator as their FIRST argument, hence the rotated order below.
template <>
inline __m512 madd(__m512bh a, __m512bh b, __m512 c) {
    return _mm512_dpbf16_ps(c, a, b);
}
template <>
inline __m256 madd(__m256bh a, __m256bh b, __m256 c) {
    return _mm256_dpbf16_ps(c, a, b);
}
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__ARM_FEATURE_FMA)
|
#if defined(__ARM_FEATURE_FMA)
|
||||||
@ -226,6 +238,13 @@ template <> inline __m256 load(const float *p) {
|
|||||||
}
|
}
|
||||||
#endif // __AVX__
|
#endif // __AVX__
|
||||||
|
|
||||||
|
#if defined(__AVX2__) || defined(__AVX512F__)
// Load 8 consecutive bf16 values (one unaligned 128-bit read) and widen them
// to 8 floats: every 16-bit value is zero-extended to 32 bits, shifted left
// by 16 so it occupies the upper half of the lane, and the integer vector is
// then reinterpreted as float.
template <> inline __m256 load(const ggml_bf16_t *p) {
    return _mm256_castsi256_ps(
        _mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16));
}
#endif // __AVX2__ || __AVX512F__
|
||||||
|
|
||||||
#if defined(__F16C__)
|
#if defined(__F16C__)
|
||||||
template <> inline __m256 load(const ggml_fp16_t *p) {
|
template <> inline __m256 load(const ggml_fp16_t *p) {
|
||||||
return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
|
return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
|
||||||
@ -239,8 +258,27 @@ template <> inline __m512 load(const float *p) {
|
|||||||
template <> inline __m512 load(const ggml_fp16_t *p) {
|
template <> inline __m512 load(const ggml_fp16_t *p) {
|
||||||
return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
|
return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
|
||||||
}
|
}
|
||||||
|
// Load 16 consecutive bf16 values (one unaligned 256-bit read) and widen them
// to 16 floats: every 16-bit value is zero-extended to 32 bits, shifted left
// by 16 so it occupies the upper half of the lane, and the integer vector is
// then reinterpreted as float.
template <> inline __m512 load(const ggml_bf16_t *p) {
    return _mm512_castsi512_ps(
        _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16));
}
|
||||||
#endif // __AVX512F__
|
#endif // __AVX512F__
|
||||||
|
|
||||||
|
#if defined(__AVX512BF16__)
// Loads producing packed-bf16 vectors (the operand type of the bf16 madd()).

// bf16 source: the bits are already in the right format, so the data is read
// with an unaligned float load and the register is only re-typed.
template <> inline __m512bh load(const ggml_bf16_t *p) {
    return (__m512bh)_mm512_loadu_ps((const float *)p);
}
template <> inline __m256bh load(const ggml_bf16_t *p) {
    return (__m256bh)_mm256_loadu_ps((const float *)p);
}

// float source: converted to bf16 on the fly.
// 32 floats -> __m512bh. The first argument (p + 16 .. p + 31) and the second
// (p .. p + 15) are packed into a single vector by the intrinsic.
template <> inline __m512bh load(const float *p) {
    return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p));
}
// 16 floats -> __m256bh.
template <> inline __m256bh load(const float *p) {
    return _mm512_cvtneps_pbh(_mm512_loadu_ps(p));
}
#endif
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// CONSTANTS
|
// CONSTANTS
|
||||||
|
|
||||||
@ -252,199 +290,170 @@ static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// FLOATING POINT MATRIX MULTIPLICATION
|
// FLOATING POINT MATRIX MULTIPLICATION
|
||||||
|
|
||||||
|
/**
 * Balanced tile size for splitting `m` elements into tiles of at most `M`.
 *
 * The number of tiles is ceil(m / M); the returned size is ceil(m / tiles), so
 * the range can be covered with tiles of exactly that size and of that size
 * minus one (e.g. m = 13, M = 6 -> 3 tiles, size 5 -> 5 + 4 + 4).
 *
 * @tparam M  largest tile size allowed (must be > 0).
 * @param  m  number of elements to tile.
 * @return the balanced tile size, or 0 when `m` is 0 (the previous version
 *         divided by a zero tile count in that case).
 */
template <int M>
static constexpr inline int64_t BLOCK_SIZE(size_t m) {
    static_assert(M > 0, "BLOCK_SIZE: the maximum tile size must be positive");
    if (m == 0) {
        return 0;
    }
    // do all the arithmetic in one signed type instead of mixing size_t/int64_t
    const int64_t len       = static_cast<int64_t>(m);
    const int64_t NB_BLOC_M = (len + M - 1) / M;
    return (len + NB_BLOC_M - 1) / NB_BLOC_M;
}
|
||||||
|
|
||||||
|
/**
 * Start offset of block number `ib` in a sequence where the first `ibN`
 * blocks hold `bloc_size` elements each and every following block holds
 * `bloc_size - 1`.
 *
 * @param ib         index of the block whose start is wanted.
 * @param ibN        number of leading full-size blocks.
 * @param bloc_size  size of a full block.
 * @return offset of the first element of block `ib`.
 */
static constexpr inline int64_t BLOC_POS(int64_t ib, int64_t ibN, int64_t bloc_size) {
    return ib < ibN ? ib * bloc_size : ibN * bloc_size + (ib - ibN) * (bloc_size - 1);
}
|
||||||
|
|
||||||
template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
|
template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
|
||||||
class tinyBLAS {
|
class tinyBLAS {
|
||||||
public:
|
public:
|
||||||
tinyBLAS(int64_t k,
|
tinyBLAS(const ggml_compute_params * params, int64_t k,
|
||||||
const TA *A, int64_t lda,
|
const TA *A, int64_t lda,
|
||||||
const TB *B, int64_t ldb,
|
const TB *B, int64_t ldb,
|
||||||
TC *C, int64_t ldc,
|
TC *C, int64_t ldc)
|
||||||
int ith, int nth)
|
: params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) {
|
||||||
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void matmul(int64_t m, int64_t n) {
|
bool matmul(int64_t m, int64_t n) {
|
||||||
mnpack(0, m, 0, n);
|
if (k % KN != 0)
|
||||||
|
return false;
|
||||||
|
// compute RM for only need tile with size RM&RM-1
|
||||||
|
#if VECTOR_REGISTERS == 32
|
||||||
|
if (m % 16 == 0 && (m/16 >= params->nth)) {
|
||||||
|
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
|
||||||
|
mnpack<4, 6, 4>(m, n, SIZE_N, 12);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (m % 8 == 0 ) {
|
||||||
|
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
|
||||||
|
mnpack<4, 6, 2>(m, n, SIZE_N, 12);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (m % 4 == 0) {
|
||||||
|
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
|
||||||
|
mnpack<4, 6, 1>(m, n, SIZE_N, 12);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
#else // VECTOR_REGISTERS == 16
|
||||||
|
if (m % 16 == 0 && (m/16 >= params->nth)) {
|
||||||
|
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
|
||||||
|
mnpack<4, 3, 4>(m, n, SIZE_N, 24);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (m % 8 == 0 ) {
|
||||||
|
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
|
||||||
|
mnpack<4, 3, 2>(m, n, SIZE_N, 24);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (m % 4 == 0) {
|
||||||
|
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
|
||||||
|
mnpack<4, 3, 1>(m, n, SIZE_N, 24);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
template <int RM, int RN, int BM>
|
||||||
int64_t mc, nc, mp, np;
|
inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) {
|
||||||
switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
|
if (SIZE_N == RN) {
|
||||||
#if VECTOR_REGISTERS == 32
|
return gemm<RM, RN, BM>(m, n, BN);
|
||||||
case 0x55:
|
}
|
||||||
mc = 5;
|
if constexpr (RN > 1) {
|
||||||
nc = 5;
|
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
|
||||||
gemm<5, 5>(m0, m, n0, n);
|
} else {
|
||||||
break;
|
GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
|
||||||
case 0x45:
|
GGML_ASSERT(false); // we have miss something.
|
||||||
mc = 4;
|
|
||||||
nc = 5;
|
|
||||||
gemm<4, 5>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x54:
|
|
||||||
mc = 5;
|
|
||||||
nc = 4;
|
|
||||||
gemm<5, 4>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x44:
|
|
||||||
mc = 4;
|
|
||||||
nc = 4;
|
|
||||||
gemm<4, 4>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x53:
|
|
||||||
mc = 5;
|
|
||||||
nc = 3;
|
|
||||||
gemm<5, 3>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x35:
|
|
||||||
mc = 3;
|
|
||||||
nc = 5;
|
|
||||||
gemm<3, 5>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x43:
|
|
||||||
mc = 4;
|
|
||||||
nc = 3;
|
|
||||||
gemm<4, 3>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
#else
|
|
||||||
case 0x55:
|
|
||||||
case 0x54:
|
|
||||||
case 0x53:
|
|
||||||
case 0x45:
|
|
||||||
case 0x44:
|
|
||||||
case 0x43:
|
|
||||||
mc = 4;
|
|
||||||
nc = 3;
|
|
||||||
gemm<4, 3>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x35:
|
|
||||||
#endif
|
|
||||||
case 0x34:
|
|
||||||
mc = 3;
|
|
||||||
nc = 4;
|
|
||||||
gemm<3, 4>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x52:
|
|
||||||
mc = 5;
|
|
||||||
nc = 2;
|
|
||||||
gemm<5, 2>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x33:
|
|
||||||
mc = 3;
|
|
||||||
nc = 3;
|
|
||||||
gemm<3, 3>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x25:
|
|
||||||
mc = 2;
|
|
||||||
nc = 5;
|
|
||||||
gemm<2, 5>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x42:
|
|
||||||
mc = 4;
|
|
||||||
nc = 2;
|
|
||||||
gemm<4, 2>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x24:
|
|
||||||
mc = 2;
|
|
||||||
nc = 4;
|
|
||||||
gemm<2, 4>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x32:
|
|
||||||
mc = 3;
|
|
||||||
nc = 2;
|
|
||||||
gemm<3, 2>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x23:
|
|
||||||
mc = 2;
|
|
||||||
nc = 3;
|
|
||||||
gemm<2, 3>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x51:
|
|
||||||
mc = 5;
|
|
||||||
nc = 1;
|
|
||||||
gemm<5, 1>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x41:
|
|
||||||
mc = 4;
|
|
||||||
nc = 1;
|
|
||||||
gemm<4, 1>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x22:
|
|
||||||
mc = 2;
|
|
||||||
nc = 2;
|
|
||||||
gemm<2, 2>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x15:
|
|
||||||
mc = 1;
|
|
||||||
nc = 5;
|
|
||||||
gemm<1, 5>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x14:
|
|
||||||
mc = 1;
|
|
||||||
nc = 4;
|
|
||||||
gemm<1, 4>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x31:
|
|
||||||
mc = 3;
|
|
||||||
nc = 1;
|
|
||||||
gemm<3, 1>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x13:
|
|
||||||
mc = 1;
|
|
||||||
nc = 3;
|
|
||||||
gemm<1, 3>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x21:
|
|
||||||
mc = 2;
|
|
||||||
nc = 1;
|
|
||||||
gemm<2, 1>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x12:
|
|
||||||
mc = 1;
|
|
||||||
nc = 2;
|
|
||||||
gemm<1, 2>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
case 0x11:
|
|
||||||
mc = 1;
|
|
||||||
nc = 1;
|
|
||||||
gemm<1, 1>(m0, m, n0, n);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
mp = m0 + (m - m0) / mc * mc;
|
|
||||||
np = n0 + (n - n0) / nc * nc;
|
|
||||||
mnpack(mp, m, n0, np);
|
|
||||||
mnpack(m0, m, np, n);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int RM, int RN>
|
template <int RM, int RN>
|
||||||
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
inline void gemm_bloc(int64_t ii, int64_t jj) {
|
||||||
int64_t ytiles = (m - m0) / RM;
|
|
||||||
int64_t xtiles = (n - n0) / RN;
|
|
||||||
int64_t tiles = xtiles * ytiles;
|
|
||||||
int64_t duty = (tiles + nth - 1) / nth;
|
|
||||||
int64_t start = duty * ith;
|
|
||||||
int64_t end = start + duty;
|
|
||||||
if (end > tiles)
|
|
||||||
end = tiles;
|
|
||||||
for (int64_t job = start; job < end; ++job) {
|
|
||||||
int64_t ii = m0 + job / xtiles * RM;
|
|
||||||
int64_t jj = n0 + job % xtiles * RN;
|
|
||||||
D Cv[RN][RM] = {};
|
D Cv[RN][RM] = {};
|
||||||
for (int64_t l = 0; l < k; l += KN)
|
for (int64_t l = 0; l < k; l += KN) {
|
||||||
for (int64_t j = 0; j < RN; ++j)
|
// help compiler for op order.
|
||||||
for (int64_t i = 0; i < RM; ++i)
|
if constexpr (RM <= RN) {
|
||||||
Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l),
|
V Av[RM];
|
||||||
load<V>(B + ldb * (jj + j) + l),
|
for (int64_t i = 0; i < RM; ++i) {
|
||||||
Cv[j][i]);
|
Av[i] = load<V>(A + lda * (ii + i) + l);
|
||||||
|
}
|
||||||
|
for (int64_t j = 0; j < RN; ++j) {
|
||||||
|
V Bv = load<V>(B + ldb * (jj + j) + l);
|
||||||
|
for (int64_t i = 0; i < RM; ++i) {
|
||||||
|
Cv[j][i] = madd(Av[i], Bv, Cv[j][i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
V Bv[RN];
|
||||||
|
for (int64_t j = 0; j < RN; ++j) {
|
||||||
|
Bv[j] = load<V>(B + ldb * (jj + j) + l);
|
||||||
|
}
|
||||||
|
for (int64_t i = 0; i < RM; ++i) {
|
||||||
|
V Av = load<V>(A + lda * (ii + i) + l);
|
||||||
|
for (int64_t j = 0; j < RN; ++j) {
|
||||||
|
Cv[j][i] = madd(Av, Bv[j], Cv[j][i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
for (int64_t j = 0; j < RN; ++j)
|
for (int64_t j = 0; j < RN; ++j)
|
||||||
for (int64_t i = 0; i < RM; ++i)
|
for (int64_t i = 0; i < RM; ++i)
|
||||||
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
|
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <int RM, int RN, int BM>
|
||||||
|
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
|
||||||
|
static std::atomic<int64_t> current_chunk;
|
||||||
|
|
||||||
|
GGML_ASSERT(m % (RM * BM) == 0);
|
||||||
|
const int64_t ytiles = m / (RM * BM);
|
||||||
|
const int64_t xtiles = (n + RN -1) / RN;
|
||||||
|
const int64_t jj_RN = (xtiles - (xtiles * RN - n));
|
||||||
|
|
||||||
|
// "round" bloc_size to "nearest" BN
|
||||||
|
const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;
|
||||||
|
const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;
|
||||||
|
const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));
|
||||||
|
const int64_t nb_job = ytiles * NB_BN;
|
||||||
|
|
||||||
|
if (params->ith == 0) {
|
||||||
|
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
|
||||||
|
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
|
||||||
|
std::atomic_store_explicit(¤t_chunk, (int64_t)params->nth, std::memory_order_relaxed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ggml_barrier(params->threadpool);
|
||||||
|
|
||||||
|
int64_t job = params->ith;
|
||||||
|
while (job < nb_job) {
|
||||||
|
const int64_t ii = (job % ytiles) * RM * BM;
|
||||||
|
const int64_t jb = job / ytiles;
|
||||||
|
const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
|
||||||
|
const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);
|
||||||
|
|
||||||
|
const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);
|
||||||
|
const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);
|
||||||
|
const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;
|
||||||
|
|
||||||
|
for (int64_t bi = 0; bi < BM * RM; bi += RM) {
|
||||||
|
int64_t jj = jj0;
|
||||||
|
for (; jj < jj1; jj += RN) {
|
||||||
|
gemm_bloc<RM, RN>(ii + bi, jj);
|
||||||
|
}
|
||||||
|
if constexpr (RN > 1) {
|
||||||
|
for (; jj < jj2; jj += RN - 1) {
|
||||||
|
gemm_bloc<RM, RN-1>(ii + bi, jj);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
GGML_ASSERT(jj == jj2);
|
||||||
|
}
|
||||||
|
|
||||||
|
// next step.
|
||||||
|
job = std::atomic_fetch_add_explicit(¤t_chunk, (int64_t)1, std::memory_order_relaxed);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_barrier(params->threadpool);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const ggml_compute_params * params;
|
||||||
const TA *const A;
|
const TA *const A;
|
||||||
const TB *const B;
|
const TB *const B;
|
||||||
TC *const C;
|
TC *const C;
|
||||||
@ -452,8 +461,6 @@ class tinyBLAS {
|
|||||||
const int64_t lda;
|
const int64_t lda;
|
||||||
const int64_t ldb;
|
const int64_t ldb;
|
||||||
const int64_t ldc;
|
const int64_t ldc;
|
||||||
const int ith;
|
|
||||||
const int nth;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -1657,8 +1664,9 @@ class tinyBLAS_PPC {
|
|||||||
* @param Ctype is GGML data type of `C`
|
* @param Ctype is GGML data type of `C`
|
||||||
* @return true if this function was able to service the matmul request
|
* @return true if this function was able to service the matmul request
|
||||||
*/
|
*/
|
||||||
bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
|
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
|
||||||
int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
|
const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
|
||||||
|
int64_t ldc, int Atype, int Btype, int Ctype) {
|
||||||
|
|
||||||
assert(m >= 0);
|
assert(m >= 0);
|
||||||
assert(n >= 0);
|
assert(n >= 0);
|
||||||
@ -1666,8 +1674,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
|||||||
assert(lda >= k);
|
assert(lda >= k);
|
||||||
assert(ldb >= k);
|
assert(ldb >= k);
|
||||||
assert(ldc >= m);
|
assert(ldc >= m);
|
||||||
assert(nth > 0);
|
assert(params->nth > 0);
|
||||||
assert(ith < nth);
|
assert(params->ith < params->nth);
|
||||||
|
|
||||||
// only enable sgemm for prompt processing
|
// only enable sgemm for prompt processing
|
||||||
if (n < 2)
|
if (n < 2)
|
||||||
@ -1682,37 +1690,25 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
|||||||
if (Btype != GGML_TYPE_F32)
|
if (Btype != GGML_TYPE_F32)
|
||||||
return false;
|
return false;
|
||||||
#if defined(__AVX512F__)
|
#if defined(__AVX512F__)
|
||||||
if (k % 16)
|
tinyBLAS<16, __m512, __m512, float, float, float> tb{ params,
|
||||||
return false;
|
|
||||||
tinyBLAS<16, __m512, __m512, float, float, float> tb{
|
|
||||||
k, (const float *)A, lda,
|
k, (const float *)A, lda,
|
||||||
(const float *)B, ldb,
|
(const float *)B, ldb,
|
||||||
(float *)C, ldc,
|
(float *)C, ldc};
|
||||||
ith, nth};
|
return tb.matmul(m, n);
|
||||||
tb.matmul(m, n);
|
|
||||||
return true;
|
|
||||||
#elif defined(__AVX__) || defined(__AVX2__)
|
#elif defined(__AVX__) || defined(__AVX2__)
|
||||||
if (k % 8)
|
tinyBLAS<8, __m256, __m256, float, float, float> tb{ params,
|
||||||
return false;
|
|
||||||
tinyBLAS<8, __m256, __m256, float, float, float> tb{
|
|
||||||
k, (const float *)A, lda,
|
k, (const float *)A, lda,
|
||||||
(const float *)B, ldb,
|
(const float *)B, ldb,
|
||||||
(float *)C, ldc,
|
(float *)C, ldc};
|
||||||
ith, nth};
|
return tb.matmul(m, n);
|
||||||
tb.matmul(m, n);
|
|
||||||
return true;
|
|
||||||
#elif defined(__ARM_NEON)
|
#elif defined(__ARM_NEON)
|
||||||
if (n < 4)
|
if (n < 4)
|
||||||
return false;
|
return false;
|
||||||
if (k % 4)
|
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
|
||||||
return false;
|
|
||||||
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{
|
|
||||||
k, (const float *)A, lda,
|
k, (const float *)A, lda,
|
||||||
(const float *)B, ldb,
|
(const float *)B, ldb,
|
||||||
(float *)C, ldc,
|
(float *)C, ldc};
|
||||||
ith, nth};
|
return tb.matmul(m, n);
|
||||||
tb.matmul(m, n);
|
|
||||||
return true;
|
|
||||||
#elif defined(__MMA__)
|
#elif defined(__MMA__)
|
||||||
if (k % 8)
|
if (k % 8)
|
||||||
return false;
|
return false;
|
||||||
@ -1720,7 +1716,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
|||||||
k, (const float *)A, lda,
|
k, (const float *)A, lda,
|
||||||
(const float *)B, ldb,
|
(const float *)B, ldb,
|
||||||
(float *)C, ldc,
|
(float *)C, ldc,
|
||||||
ith, nth};
|
params->ith, params->nth};
|
||||||
tb.matmul(m, n);
|
tb.matmul(m, n);
|
||||||
return true;
|
return true;
|
||||||
#else
|
#else
|
||||||
@ -1728,60 +1724,71 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case GGML_TYPE_BF16: {
|
||||||
|
#if defined(__AVX512BF16__)
|
||||||
|
if (Btype == GGML_TYPE_BF16) {
|
||||||
|
tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
|
||||||
|
(const ggml_bf16_t *)A, lda,
|
||||||
|
(const ggml_bf16_t *)B, ldb,
|
||||||
|
(float *)C, ldc};
|
||||||
|
return tb.matmul(m, n);
|
||||||
|
}
|
||||||
|
#elif defined(__AVX512F__)
|
||||||
|
if (Btype == GGML_TYPE_BF16) {
|
||||||
|
tinyBLAS<16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
|
||||||
|
(const ggml_bf16_t *)A, lda,
|
||||||
|
(const ggml_bf16_t *)B, ldb,
|
||||||
|
(float *)C, ldc};
|
||||||
|
return tb.matmul(m, n);
|
||||||
|
}
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
if (Btype == GGML_TYPE_BF16) {
|
||||||
|
tinyBLAS<8, __m256, __m256, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
|
||||||
|
(const ggml_bf16_t *)A, lda,
|
||||||
|
(const ggml_bf16_t *)B, ldb,
|
||||||
|
(float *)C, ldc};
|
||||||
|
return tb.matmul(m, n);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return false;
|
||||||
|
}
|
||||||
case GGML_TYPE_F16: {
|
case GGML_TYPE_F16: {
|
||||||
#if defined(__AVX512F__)
|
#if defined(__AVX512F__)
|
||||||
if (k % 16)
|
if (Btype == GGML_TYPE_F16) {
|
||||||
return false;
|
tinyBLAS<16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
|
||||||
if (Btype != GGML_TYPE_F32)
|
(const ggml_fp16_t *)A, lda,
|
||||||
return false;
|
(const ggml_fp16_t *)B, ldb,
|
||||||
tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{
|
(float *)C, ldc};
|
||||||
k, (const ggml_fp16_t *)A, lda,
|
return tb.matmul(m, n);
|
||||||
(const float *)B, ldb,
|
}
|
||||||
(float *)C, ldc,
|
|
||||||
ith, nth};
|
|
||||||
tb.matmul(m, n);
|
|
||||||
return true;
|
|
||||||
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
|
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
|
||||||
if (k % 8)
|
if (Btype == GGML_TYPE_F16) {
|
||||||
return false;
|
tinyBLAS<8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
|
||||||
if (Btype != GGML_TYPE_F32)
|
(const ggml_fp16_t *)A, lda,
|
||||||
return false;
|
(const ggml_fp16_t *)B, ldb,
|
||||||
tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{
|
(float *)C, ldc};
|
||||||
k, (const ggml_fp16_t *)A, lda,
|
return tb.matmul(m, n);
|
||||||
(const float *)B, ldb,
|
}
|
||||||
(float *)C, ldc,
|
|
||||||
ith, nth};
|
|
||||||
tb.matmul(m, n);
|
|
||||||
return true;
|
|
||||||
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
|
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
|
||||||
if (n < 8)
|
if (n < 8)
|
||||||
return false;
|
return false;
|
||||||
if (k % 8)
|
if (Btype == GGML_TYPE_F16) {
|
||||||
return false;
|
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
|
||||||
if (Btype != GGML_TYPE_F16)
|
|
||||||
return false;
|
|
||||||
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{
|
|
||||||
k, (const ggml_fp16_t *)A, lda,
|
k, (const ggml_fp16_t *)A, lda,
|
||||||
(const ggml_fp16_t *)B, ldb,
|
(const ggml_fp16_t *)B, ldb,
|
||||||
(float *)C, ldc,
|
(float *)C, ldc};
|
||||||
ith, nth};
|
return tb.matmul(m, n);
|
||||||
tb.matmul(m, n);
|
}
|
||||||
return true;
|
|
||||||
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
|
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||||
if (k % 4)
|
if (Btype == GGML_TYPE_F32) {
|
||||||
return false;
|
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ params,
|
||||||
if (Btype != GGML_TYPE_F32)
|
|
||||||
return false;
|
|
||||||
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{
|
|
||||||
k, (const ggml_fp16_t *)A, lda,
|
k, (const ggml_fp16_t *)A, lda,
|
||||||
(const float *)B, ldb,
|
(const float *)B, ldb,
|
||||||
(float *)C, ldc,
|
(float *)C, ldc};
|
||||||
ith, nth};
|
return tb.matmul(m, n);
|
||||||
tb.matmul(m, n);
|
}
|
||||||
return true;
|
|
||||||
#else
|
|
||||||
return false;
|
|
||||||
#endif
|
#endif
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
case GGML_TYPE_Q8_0: {
|
case GGML_TYPE_Q8_0: {
|
||||||
@ -1792,7 +1799,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
|||||||
k, (const block_q8_0 *)A, lda,
|
k, (const block_q8_0 *)A, lda,
|
||||||
(const block_q8_0 *)B, ldb,
|
(const block_q8_0 *)B, ldb,
|
||||||
(float *)C, ldc,
|
(float *)C, ldc,
|
||||||
ith, nth};
|
params->ith, params->nth};
|
||||||
tb.matmul(m, n);
|
tb.matmul(m, n);
|
||||||
return true;
|
return true;
|
||||||
#elif defined(__ARM_FEATURE_DOTPROD)
|
#elif defined(__ARM_FEATURE_DOTPROD)
|
||||||
@ -1800,7 +1807,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
|||||||
k, (const block_q8_0 *)A, lda,
|
k, (const block_q8_0 *)A, lda,
|
||||||
(const block_q8_0 *)B, ldb,
|
(const block_q8_0 *)B, ldb,
|
||||||
(float *)C, ldc,
|
(float *)C, ldc,
|
||||||
ith, nth};
|
params->ith, params->nth};
|
||||||
tb.matmul(m, n);
|
tb.matmul(m, n);
|
||||||
return true;
|
return true;
|
||||||
#else
|
#else
|
||||||
@ -1816,7 +1823,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
|||||||
k, (const block_q4_0 *)A, lda,
|
k, (const block_q4_0 *)A, lda,
|
||||||
(const block_q8_0 *)B, ldb,
|
(const block_q8_0 *)B, ldb,
|
||||||
(float *)C, ldc,
|
(float *)C, ldc,
|
||||||
ith, nth};
|
params->ith, params->nth};
|
||||||
tb.matmul(m, n);
|
tb.matmul(m, n);
|
||||||
return true;
|
return true;
|
||||||
#elif defined(__ARM_FEATURE_DOTPROD)
|
#elif defined(__ARM_FEATURE_DOTPROD)
|
||||||
@ -1824,7 +1831,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
|||||||
k, (const block_q4_0 *)A, lda,
|
k, (const block_q4_0 *)A, lda,
|
||||||
(const block_q8_0 *)B, ldb,
|
(const block_q8_0 *)B, ldb,
|
||||||
(float *)C, ldc,
|
(float *)C, ldc,
|
||||||
ith, nth};
|
params->ith, params->nth};
|
||||||
tb.matmul(m, n);
|
tb.matmul(m, n);
|
||||||
return true;
|
return true;
|
||||||
#else
|
#else
|
||||||
@ -1840,7 +1847,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
|||||||
k, (const block_q5_0 *)A, lda,
|
k, (const block_q5_0 *)A, lda,
|
||||||
(const block_q8_0 *)B, ldb,
|
(const block_q8_0 *)B, ldb,
|
||||||
(float *)C, ldc,
|
(float *)C, ldc,
|
||||||
ith, nth};
|
params->ith, params->nth};
|
||||||
tb.matmul(m, n);
|
tb.matmul(m, n);
|
||||||
return true;
|
return true;
|
||||||
#else
|
#else
|
||||||
@ -1856,7 +1863,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
|||||||
k, (const block_iq4_nl *)A, lda,
|
k, (const block_iq4_nl *)A, lda,
|
||||||
(const block_q8_0 *)B, ldb,
|
(const block_q8_0 *)B, ldb,
|
||||||
(float *)C, ldc,
|
(float *)C, ldc,
|
||||||
ith, nth};
|
params->ith, params->nth};
|
||||||
tb.matmul(m, n);
|
tb.matmul(m, n);
|
||||||
return true;
|
return true;
|
||||||
#else
|
#else
|
||||||
@ -1868,6 +1875,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
(void)params;
|
||||||
(void)m;
|
(void)m;
|
||||||
(void)n;
|
(void)n;
|
||||||
(void)k;
|
(void)k;
|
||||||
@ -1877,8 +1885,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
|||||||
(void)ldb;
|
(void)ldb;
|
||||||
(void)C;
|
(void)C;
|
||||||
(void)ldc;
|
(void)ldc;
|
||||||
(void)ith;
|
|
||||||
(void)nth;
|
|
||||||
(void)Atype;
|
(void)Atype;
|
||||||
(void)Btype;
|
(void)Btype;
|
||||||
(void)Ctype;
|
(void)Ctype;
|
||||||
|
@ -5,8 +5,8 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
|
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
|
||||||
const void *, int64_t, void *, int64_t, int, int,
|
const void *, int64_t, const void *, int64_t, void *, int64_t,
|
||||||
int, int, int);
|
int, int, int);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
@ -411,7 +411,7 @@ struct vk_op_unary_push_constants {
|
|||||||
uint32_t ne;
|
uint32_t ne;
|
||||||
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
||||||
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
|
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
|
||||||
uint32_t d_offset;
|
uint32_t misalign_offsets;
|
||||||
float param1; float param2;
|
float param1; float param2;
|
||||||
uint32_t ne0_012mp; uint32_t ne0_012L;
|
uint32_t ne0_012mp; uint32_t ne0_012L;
|
||||||
uint32_t ne0_01mp; uint32_t ne0_01L;
|
uint32_t ne0_01mp; uint32_t ne0_01L;
|
||||||
@ -459,7 +459,7 @@ struct vk_op_binary_push_constants {
|
|||||||
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
||||||
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
|
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
|
||||||
uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
|
uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
|
||||||
uint32_t d_offset;
|
uint32_t misalign_offsets;
|
||||||
float param1; float param2; int32_t param3;
|
float param1; float param2; int32_t param3;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -546,7 +546,7 @@ struct vk_staging_memcpy {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct vk_op_upscale_push_constants {
|
struct vk_op_upscale_push_constants {
|
||||||
uint32_t ne; uint32_t d_offset;
|
uint32_t ne; uint32_t a_offset; uint32_t d_offset;
|
||||||
uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
||||||
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
|
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
|
||||||
float sf0; float sf1; float sf2; float sf3;
|
float sf0; float sf1; float sf2; float sf3;
|
||||||
@ -1404,10 +1404,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||||||
// spec constants and tile sizes for non-quant matmul/matmul_id
|
// spec constants and tile sizes for non-quant matmul/matmul_id
|
||||||
l_warptile = { 256, 128, 256, 64 };
|
l_warptile = { 256, 128, 256, 64 };
|
||||||
m_warptile = { 256, 128, 128, 64 };
|
m_warptile = { 256, 128, 128, 64 };
|
||||||
s_warptile = { 128, 32, 16, 64 };
|
s_warptile = { 128, 64, 64, 64 };
|
||||||
l_wg_denoms = {128, 256, 1 };
|
l_wg_denoms = {128, 256, 1 };
|
||||||
m_wg_denoms = {128, 128, 1 };
|
m_wg_denoms = {128, 128, 1 };
|
||||||
s_wg_denoms = { 32, 16, 1 };
|
s_wg_denoms = { 64, 64, 1 };
|
||||||
|
|
||||||
// spec constants and tile sizes for quant matmul (non-Qi_K)
|
// spec constants and tile sizes for quant matmul (non-Qi_K)
|
||||||
l_warptile_mmq = { 256, 128, 256, 64 };
|
l_warptile_mmq = { 256, 128, 256, 64 };
|
||||||
@ -1855,53 +1855,58 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||||||
|
|
||||||
// mul mat vec
|
// mul mat vec
|
||||||
|
|
||||||
// AMD GCN and Intel graphics cards perform best when the number of rows per shader is doubled
|
// the number of rows computed per shader depends on GPU model and quant
|
||||||
uint32_t rm = 1;
|
uint32_t rm_stdq = 1;
|
||||||
if ((device->vendor_id == VK_VENDOR_ID_AMD && device->subgroup_min_size == 64 && device->subgroup_max_size == 64) || device->vendor_id == VK_VENDOR_ID_INTEL)
|
uint32_t rm_kq = 2;
|
||||||
rm = 2;
|
if (device->vendor_id == VK_VENDOR_ID_AMD) {
|
||||||
|
if (device->subgroup_min_size == 64 && device->subgroup_max_size == 64) { // GCN
|
||||||
|
rm_stdq = 2;
|
||||||
|
rm_kq = 4;
|
||||||
|
}
|
||||||
|
} else if (device->vendor_id == VK_VENDOR_ID_INTEL)
|
||||||
|
rm_stdq = 2;
|
||||||
|
|
||||||
// computing additional rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0.
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
|
||||||
|
|
||||||
// dequant shaders
|
// dequant shaders
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
||||||
@ -2012,11 +2017,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
|
||||||
if (device->float_controls_rte_fp16) {
|
if (device->float_controls_rte_fp16) {
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
|
||||||
} else {
|
} else {
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
||||||
@ -3205,8 +3210,8 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
|
|||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
// Check if src is pinned memory
|
// Check if src is pinned memory
|
||||||
vk_buffer buf;
|
vk_buffer buf = nullptr;
|
||||||
size_t buf_offset;
|
size_t buf_offset = 0;
|
||||||
ggml_vk_host_get(ctx->device, tensor->data, buf, buf_offset);
|
ggml_vk_host_get(ctx->device, tensor->data, buf, buf_offset);
|
||||||
|
|
||||||
const uint64_t ne0 = tensor->ne[0];
|
const uint64_t ne0 = tensor->ne[0];
|
||||||
@ -3269,7 +3274,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
|
|||||||
VkBufferCopy buf_copy{ 0, offset, copy_size };
|
VkBufferCopy buf_copy{ 0, offset, copy_size };
|
||||||
|
|
||||||
ggml_vk_sync_buffers(subctx);
|
ggml_vk_sync_buffers(subctx);
|
||||||
vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);
|
vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
|
||||||
|
|
||||||
for (uint64_t i3 = 0; i3 < ne3; i3++) {
|
for (uint64_t i3 = 0; i3 < ne3; i3++) {
|
||||||
for (uint64_t i2 = 0; i2 < ne2; i2++) {
|
for (uint64_t i2 = 0; i2 < ne2; i2++) {
|
||||||
@ -3302,7 +3307,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
|
|||||||
}
|
}
|
||||||
// Check if src is pinned memory
|
// Check if src is pinned memory
|
||||||
vk_buffer buf = nullptr;
|
vk_buffer buf = nullptr;
|
||||||
size_t buf_offset;
|
size_t buf_offset = 0;
|
||||||
ggml_vk_host_get(dst->device, src, buf, buf_offset);
|
ggml_vk_host_get(dst->device, src, buf, buf_offset);
|
||||||
|
|
||||||
if (buf != nullptr) {
|
if (buf != nullptr) {
|
||||||
@ -3344,7 +3349,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
|
|||||||
copy_size};
|
copy_size};
|
||||||
|
|
||||||
ggml_vk_sync_buffers(subctx);
|
ggml_vk_sync_buffers(subctx);
|
||||||
vkCmdCopyBuffer(subctx->s->buffer, staging_buffer->buffer, dst->buffer, 1, &buf_copy);
|
vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
|
||||||
|
|
||||||
if (width == spitch) {
|
if (width == spitch) {
|
||||||
deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys);
|
deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys);
|
||||||
@ -3400,7 +3405,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
|
|||||||
|
|
||||||
// Check if dst is pinned memory
|
// Check if dst is pinned memory
|
||||||
vk_buffer buf = nullptr;
|
vk_buffer buf = nullptr;
|
||||||
size_t buf_offset;
|
size_t buf_offset = 0;
|
||||||
ggml_vk_host_get(src->device, dst, buf, buf_offset);
|
ggml_vk_host_get(src->device, dst, buf, buf_offset);
|
||||||
|
|
||||||
std::vector<vk::BufferCopy> slices(1);
|
std::vector<vk::BufferCopy> slices(1);
|
||||||
@ -3480,7 +3485,7 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds
|
|||||||
|
|
||||||
VkBufferCopy bc{ src_offset, dst_offset, size };
|
VkBufferCopy bc{ src_offset, dst_offset, size };
|
||||||
|
|
||||||
vkCmdCopyBuffer(ctx->s->buffer, src->buffer, dst->buffer, 1, &bc);
|
vkCmdCopyBuffer(ctx->s->buffer, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
|
static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
|
||||||
@ -3732,9 +3737,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
|||||||
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
|
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
|
||||||
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
||||||
|
|
||||||
vk_buffer d_Qx;
|
vk_buffer d_Qx = nullptr;
|
||||||
size_t qx_buf_offset = 0;
|
size_t qx_buf_offset = 0;
|
||||||
vk_buffer d_Qy;
|
vk_buffer d_Qy = nullptr;
|
||||||
size_t qy_buf_offset = 0;
|
size_t qy_buf_offset = 0;
|
||||||
|
|
||||||
bool src0_uma = false;
|
bool src0_uma = false;
|
||||||
@ -3934,9 +3939,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||||||
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
|
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
|
||||||
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
||||||
|
|
||||||
vk_buffer d_Qx;
|
vk_buffer d_Qx = nullptr;
|
||||||
size_t qx_buf_offset = 0;
|
size_t qx_buf_offset = 0;
|
||||||
vk_buffer d_Qy;
|
vk_buffer d_Qy = nullptr;
|
||||||
size_t qy_buf_offset = 0;
|
size_t qy_buf_offset = 0;
|
||||||
|
|
||||||
bool src0_uma = false;
|
bool src0_uma = false;
|
||||||
@ -4112,7 +4117,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
|||||||
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
|
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
|
||||||
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
||||||
|
|
||||||
vk_buffer d_Qy;
|
vk_buffer d_Qy = nullptr;
|
||||||
size_t qy_buf_offset = 0;
|
size_t qy_buf_offset = 0;
|
||||||
|
|
||||||
bool src1_uma = false;
|
bool src1_uma = false;
|
||||||
@ -4300,11 +4305,11 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||||||
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
||||||
ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
|
ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
|
||||||
|
|
||||||
vk_buffer d_Qx;
|
vk_buffer d_Qx = nullptr;
|
||||||
size_t qx_buf_offset = 0;
|
size_t qx_buf_offset = 0;
|
||||||
vk_buffer d_Qy;
|
vk_buffer d_Qy = nullptr;
|
||||||
size_t qy_buf_offset = 0;
|
size_t qy_buf_offset = 0;
|
||||||
vk_buffer d_ids;
|
vk_buffer d_ids = nullptr;
|
||||||
size_t ids_buf_offset = 0;
|
size_t ids_buf_offset = 0;
|
||||||
|
|
||||||
bool src0_uma = false;
|
bool src0_uma = false;
|
||||||
@ -4505,11 +4510,11 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|||||||
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
|
||||||
ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
|
ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
|
||||||
|
|
||||||
vk_buffer d_Qx;
|
vk_buffer d_Qx = nullptr;
|
||||||
size_t qx_buf_offset = 0;
|
size_t qx_buf_offset = 0;
|
||||||
vk_buffer d_Qy;
|
vk_buffer d_Qy = nullptr;
|
||||||
size_t qy_buf_offset = 0;
|
size_t qy_buf_offset = 0;
|
||||||
vk_buffer d_ids;
|
vk_buffer d_ids = nullptr;
|
||||||
size_t ids_buf_offset = 0;
|
size_t ids_buf_offset = 0;
|
||||||
|
|
||||||
bool src0_uma = false;
|
bool src0_uma = false;
|
||||||
@ -4768,8 +4773,8 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
|||||||
|
|
||||||
ggml_vk_sync_buffers(subctx);
|
ggml_vk_sync_buffers(subctx);
|
||||||
|
|
||||||
vk_buffer d_Q, d_K, d_V, d_D, d_M;
|
vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr;
|
||||||
uint64_t q_buf_offset, k_buf_offset, v_buf_offset, d_buf_offset, m_buf_offset;
|
size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0;
|
||||||
|
|
||||||
bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false;
|
bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false;
|
||||||
|
|
||||||
@ -5071,6 +5076,57 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the number of bytes by which tensor `t` starts past an aligned
// storage-buffer offset on this device.
//
// The tensor's start is taken as vk_tensor_offset(t) + t->view_offs, and the
// result is that value masked with (minStorageBufferOffsetAlignment - 1), i.e.
// its remainder modulo the alignment. The mask form is only a valid remainder
// when the alignment is a power of two, which the Vulkan limits require.
//
// ctx: backend context; only ctx->device->properties.limits is read.
// t:   tensor to inspect; must be non-null (it is dereferenced unconditionally).
static uint32_t get_misalign_bytes(ggml_backend_vk_context * ctx, const ggml_tensor * t)
{
    return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));
}
|
||||||
|
|
||||||
|
// Generic fallback used for push-constant types that have no dedicated
// specialization: it writes nothing into `p` and instead asserts that every
// tensor that was supplied (non-null) starts exactly on a
// minStorageBufferOffsetAlignment boundary.
//
// ctx:              backend context forwarded to get_misalign_bytes().
// p:                push-constant struct; left untouched here. Must be non-const.
// src0, src1, src2: optional inputs; null pointers are skipped.
// dst:              optional output; a null pointer is skipped.
template <typename T>
void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx,
                                   T &p,
                                   const ggml_tensor * src0,
                                   const ggml_tensor * src1,
                                   const ggml_tensor * src2,
                                   ggml_tensor * dst) {
    // Compile-time guard first: the specializations write into `p`, so a const
    // push-constant type reaching this overload indicates a deduction mistake.
    static_assert(!std::is_const<T>::value, "unexpected type");

    // Nothing is stored in this variant; mark the arguments as intentionally
    // unused (the tensors are only referenced inside the assertion macros).
    GGML_UNUSED(p);
    GGML_UNUSED(dst);
    GGML_UNUSED(src2);
    GGML_UNUSED(src1);
    GGML_UNUSED(src0);

    // Without a field to carry a misalignment offset, each tensor that is
    // present has to be perfectly aligned.
    GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0);
    GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0);
    GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0);
    GGML_ASSERT(!dst || get_misalign_bytes(ctx, dst) == 0);
}
|
||||||
|
|
||||||
|
// Specialization for unary ops: converts the byte misalignment of src0 and dst
// into element counts (bytes / ggml_type_size of the tensor's type) and packs
// both into p.misalign_offsets: the src0 count shifted left by 16, OR-ed with
// the dst count in the low bits.
//
// src0 and dst must be non-null (both are dereferenced); src1 and src2 are
// ignored.
template <>
void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx,
                                   vk_op_unary_push_constants &p,
                                   const ggml_tensor * src0,
                                   const ggml_tensor * src1,
                                   const ggml_tensor * src2,
                                   ggml_tensor * dst) {
    GGML_UNUSED(src1);
    GGML_UNUSED(src2);

    // Misalignment expressed in elements rather than bytes.
    const uint32_t src_elems = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
    const uint32_t dst_elems = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);

    p.misalign_offsets = dst_elems | (src_elems << 16);
}
|
||||||
|
|
||||||
|
// Specialization for binary ops: converts the byte misalignment of src0, src1
// and dst into element counts and packs them into p.misalign_offsets as
//   (src0 count << 16) | (src1 count << 8) | dst count.
//
// When dst->op is GGML_OP_GET_ROWS all three counts are asserted to be zero
// (presumably that shader does not apply these offsets; confirm against the
// get_rows shaders).
//
// src0, src1 and dst must be non-null (all are dereferenced); src2 is ignored.
template <>
void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx,
                                   vk_op_binary_push_constants &p,
                                   const ggml_tensor * src0,
                                   const ggml_tensor * src1,
                                   const ggml_tensor * src2,
                                   ggml_tensor * dst) {
    GGML_UNUSED(src2);

    // Misalignment expressed in elements rather than bytes. The local names are
    // kept as-is because they appear in the assertion text below.
    const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
    const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
    const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);

    GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0));

    // Assemble the packed word field by field, lowest bits first.
    uint32_t packed = d_offset;
    packed |= b_offset << 8;
    packed |= a_offset << 16;
    p.misalign_offsets = packed;
}
|
||||||
|
|
||||||
|
// Specialization for the upscale op: stores the misalignment of src0 and dst,
// converted from bytes to elements (bytes / ggml_type_size of the tensor's
// type), into the separate p.a_offset and p.d_offset fields.
//
// src0 and dst must be non-null (both are dereferenced); src1 and src2 are
// ignored.
template <>
void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx,
                                   vk_op_upscale_push_constants &p,
                                   const ggml_tensor * src0,
                                   const ggml_tensor * src1,
                                   const ggml_tensor * src2,
                                   ggml_tensor * dst) {
    GGML_UNUSED(src1);
    GGML_UNUSED(src2);

    // Narrow to 32 bits before storing, mirroring a uint32_t intermediate.
    p.a_offset = static_cast<uint32_t>(get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type));
    p.d_offset = static_cast<uint32_t>(get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type));
}
|
||||||
|
|
||||||
template<typename PC>
|
template<typename PC>
|
||||||
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
|
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
|
||||||
VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
||||||
@ -5174,8 +5230,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||||||
}
|
}
|
||||||
|
|
||||||
GGML_ASSERT(d_D != nullptr);
|
GGML_ASSERT(d_D != nullptr);
|
||||||
uint64_t d_buf_offset = ((vk_tensor_offset(dst) + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
|
uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
|
||||||
GGML_ASSERT(d_buf_offset == vk_tensor_offset(dst) || op == GGML_OP_CPY); // NOLINT
|
|
||||||
if(!src0_uma) {
|
if(!src0_uma) {
|
||||||
d_X = src0_buf_ctx->dev_buffer;
|
d_X = src0_buf_ctx->dev_buffer;
|
||||||
x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
|
x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
|
||||||
@ -5191,6 +5246,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||||||
z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
|
z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
|
||||||
GGML_ASSERT(d_Z != nullptr);
|
GGML_ASSERT(d_Z != nullptr);
|
||||||
}
|
}
|
||||||
|
// Compute misalignment offset for descriptors and store it in in push constants, then align the descriptor offsets.
|
||||||
|
init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, dst);
|
||||||
|
x_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
||||||
|
y_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
||||||
|
z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
||||||
|
d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
||||||
|
|
||||||
if (op_supports_incontiguous) {
|
if (op_supports_incontiguous) {
|
||||||
x_sz = ggml_nbytes(src0);
|
x_sz = ggml_nbytes(src0);
|
||||||
@ -5378,7 +5439,6 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
|||||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||||
const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
|
|
||||||
|
|
||||||
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
|
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
|
||||||
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
|
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
|
||||||
@ -5390,7 +5450,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
|||||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
|
||||||
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
||||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
|
||||||
d_offset,
|
0,
|
||||||
0.0f, 0.0f, offset,
|
0.0f, 0.0f, offset,
|
||||||
}, dryrun);
|
}, dryrun);
|
||||||
}
|
}
|
||||||
@ -5474,8 +5534,8 @@ static void ggml_vk_op_f32_rwkv6(ggml_backend_vk_context * ctx, vk_context& subc
|
|||||||
|
|
||||||
ggml_vk_sync_buffers(subctx);
|
ggml_vk_sync_buffers(subctx);
|
||||||
|
|
||||||
vk_buffer d_D, d_K, d_V, d_R, d_TF, d_TD, d_State;
|
vk_buffer d_D = nullptr, d_K = nullptr, d_V = nullptr, d_R = nullptr, d_TF = nullptr, d_TD = nullptr, d_State = nullptr;
|
||||||
uint64_t k_offset, v_offset, r_offset, tf_offset, td_offset, state_offset, dst_offset;
|
size_t k_offset = 0, v_offset = 0, r_offset = 0, tf_offset = 0, td_offset = 0, state_offset = 0, dst_offset = 0;
|
||||||
bool K_uma = false, V_uma = false, R_uma = false, TF_uma = false, TD_uma = false, STATE_uma = false, DST_uma = false;
|
bool K_uma = false, V_uma = false, R_uma = false, TF_uma = false, TD_uma = false, STATE_uma = false, DST_uma = false;
|
||||||
|
|
||||||
if (ctx->device->uma) {
|
if (ctx->device->uma) {
|
||||||
@ -5594,7 +5654,7 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
|
|||||||
const float sf3 = (float)dst->ne[3] / src0->ne[3];
|
const float sf3 = (float)dst->ne[3] / src0->ne[3];
|
||||||
|
|
||||||
ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
|
ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
|
||||||
(uint32_t)ggml_nelements(dst), 0,
|
(uint32_t)ggml_nelements(dst), 0, 0,
|
||||||
(uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
(uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||||
(uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
|
(uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
|
||||||
sf0, sf1, sf2, sf3,
|
sf0, sf1, sf2, sf3,
|
||||||
@ -5704,13 +5764,12 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||||||
static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||||
const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
|
|
||||||
|
|
||||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
|
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
|
||||||
(uint32_t)ggml_nelements(src0),
|
(uint32_t)ggml_nelements(src0),
|
||||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||||
d_offset,
|
0,
|
||||||
0.0f, 0.0f,
|
0.0f, 0.0f,
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
}, dryrun);
|
}, dryrun);
|
||||||
|
@ -21,9 +21,9 @@ void main() {
|
|||||||
get_indices(idx, i00, i01, i02, i03);
|
get_indices(idx, i00, i01, i02, i03);
|
||||||
|
|
||||||
if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
|
if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
|
||||||
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
|
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
|
||||||
} else {
|
} else {
|
||||||
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]));
|
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -22,7 +22,7 @@ void main() {
|
|||||||
uint i00, i01, i02, i03;
|
uint i00, i01, i02, i03;
|
||||||
get_indices(idx, i00, i01, i02, i03);
|
get_indices(idx, i00, i01, i02, i03);
|
||||||
|
|
||||||
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
|
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
|
||||||
|
|
||||||
idx += num_threads;
|
idx += num_threads;
|
||||||
}
|
}
|
||||||
|
@ -12,6 +12,6 @@ void main() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
|
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
|
||||||
}
|
}
|
||||||
|
@ -30,12 +30,12 @@ void main() {
|
|||||||
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
|
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
|
||||||
|
|
||||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||||
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
|
data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : data_b[get_boffset() + src1_idx]);
|
||||||
#else
|
#else
|
||||||
if (is_src0) {
|
if (is_src0) {
|
||||||
data_d[p.d_offset + dst_idx] = data_a[src0_idx];
|
data_d[get_doffset() + dst_idx] = data_a[get_aoffset() + src0_idx];
|
||||||
} else {
|
} else {
|
||||||
data_d[p.d_offset + dst_idx] = data_b[src1_idx];
|
data_d[get_doffset() + dst_idx] = data_b[get_boffset() + src1_idx];
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
@ -19,9 +19,9 @@ void main() {
|
|||||||
if (idx + (num_iter-1)*num_threads < p.ne) {
|
if (idx + (num_iter-1)*num_threads < p.ne) {
|
||||||
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||||
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
|
data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
|
||||||
#else
|
#else
|
||||||
data_d[p.d_offset + idx] = data_a[idx];
|
data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
|
||||||
#endif
|
#endif
|
||||||
idx += num_threads;
|
idx += num_threads;
|
||||||
}
|
}
|
||||||
@ -32,9 +32,9 @@ void main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||||
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
|
data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
|
||||||
#else
|
#else
|
||||||
data_d[p.d_offset + idx] = data_a[idx];
|
data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
|
||||||
#endif
|
#endif
|
||||||
idx += num_threads;
|
idx += num_threads;
|
||||||
}
|
}
|
||||||
|
@ -13,8 +13,8 @@ void main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
|
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||||
#else
|
#else
|
||||||
data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
|
data_d[get_doffset() + dst_idx(idx)] = data_a[get_aoffset() + src0_idx(idx)];
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
@ -12,6 +12,6 @@ void main() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val));
|
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(cos(val));
|
||||||
}
|
}
|
||||||
|
@ -10,9 +10,10 @@ float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2
|
|||||||
const float16_t d = bl.block.d;
|
const float16_t d = bl.block.d;
|
||||||
const uint idx = coordInBlock[1];
|
const uint idx = coordInBlock[1];
|
||||||
const uint shift = (idx & 0x10) >> 2;
|
const uint shift = (idx & 0x10) >> 2;
|
||||||
uint32_t qs = unpack8(uint32_t(bl.block.qs[(idx & 0xE) >> 1]))[idx & 1];
|
uint32_t qs = uint32_t(bl.block.qs[(idx & 0xE) >> 1]);
|
||||||
qs >>= shift;
|
qs >>= shift;
|
||||||
qs &= 0xF;
|
qs &= 0x0F0F;
|
||||||
|
qs = unpack8(qs)[idx & 1];
|
||||||
float16_t ret = (float16_t(qs) - float16_t(8)) * d;
|
float16_t ret = (float16_t(qs) - float16_t(8)) * d;
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@ -152,15 +153,17 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4
|
|||||||
block_q4_K block;
|
block_q4_K block;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Buffer-reference type that exposes one block through the block_q4_K_packed16
// struct. dequantFuncQ4_K constructs it from its decodeBufQ4_K argument
// (decodeBufQ4_K_packed16(bl)) and reads block.qs through it.
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed16 {
|
||||||
|
block_q4_K_packed16 block;
|
||||||
|
};
|
||||||
|
|
||||||
float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||||
{
|
{
|
||||||
|
decodeBufQ4_K_packed16 bl16 = decodeBufQ4_K_packed16(bl);
|
||||||
const uint idx = coordInBlock[1];
|
const uint idx = coordInBlock[1];
|
||||||
const uint iqs = idx;
|
|
||||||
|
|
||||||
const uint n = iqs / 64; // 0,1,2,3
|
const uint b = (idx & 0x20) >> 5; // 0,1
|
||||||
const uint b = (iqs % 64) / 32; // 0,1
|
|
||||||
const uint is = (idx & 0xE0) >> 5; // 0..7
|
const uint is = (idx & 0xE0) >> 5; // 0..7
|
||||||
const uint qsi = n * 32 + (iqs % 32); // 0..127
|
|
||||||
|
|
||||||
const f16vec2 loadd = bl.block.d;
|
const f16vec2 loadd = bl.block.d;
|
||||||
|
|
||||||
@ -184,9 +187,11 @@ float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2
|
|||||||
const float16_t d = loadd.x * float16_t(sc);
|
const float16_t d = loadd.x * float16_t(sc);
|
||||||
const float16_t m = loadd.y * float16_t(mbyte);
|
const float16_t m = loadd.y * float16_t(mbyte);
|
||||||
|
|
||||||
uint32_t dmask = 0xF << (b * 4);
|
uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
|
||||||
|
qs = (qs >> (b * 4)) & 0x0F0F;
|
||||||
|
qs = unpack8(qs)[idx & 1];
|
||||||
|
|
||||||
float16_t ret = d * float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) - m;
|
float16_t ret = d * float16_t(qs) - m;
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@ -195,18 +200,19 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5
|
|||||||
block_q5_K block;
|
block_q5_K block;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Buffer-reference type that exposes one block through the block_q5_K_packed16
// struct. dequantFuncQ5_K constructs it from its decodeBufQ5_K argument
// (decodeBufQ5_K_packed16(bl)) and reads block.qs and block.qh through it.
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed16 {
|
||||||
|
block_q5_K_packed16 block;
|
||||||
|
};
|
||||||
|
|
||||||
float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||||
{
|
{
|
||||||
|
decodeBufQ5_K_packed16 bl16 = decodeBufQ5_K_packed16(bl);
|
||||||
const uint idx = coordInBlock[1];
|
const uint idx = coordInBlock[1];
|
||||||
const uint iqs = idx;
|
|
||||||
|
|
||||||
const uint n = iqs / 64; // 0,1,2,3
|
const uint b = (idx & 0x20) >> 5; // 0,1
|
||||||
const uint b = (iqs % 64) / 32; // 0,1
|
|
||||||
const uint is = (idx & 0xE0) >> 5; // 0..7
|
const uint is = (idx & 0xE0) >> 5; // 0..7
|
||||||
const uint qsi = n * 32 + (iqs % 32); // 0..127
|
|
||||||
const uint qhi = (iqs % 32); // 0..31
|
|
||||||
|
|
||||||
const uint8_t hm = uint8_t(1 << (iqs / 32));
|
const uint32_t hm = 0x0101 << is;
|
||||||
|
|
||||||
const f16vec2 loadd = bl.block.d;
|
const f16vec2 loadd = bl.block.d;
|
||||||
|
|
||||||
@ -230,9 +236,15 @@ float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2
|
|||||||
const float16_t d = loadd.x * float16_t(sc);
|
const float16_t d = loadd.x * float16_t(sc);
|
||||||
const float16_t m = loadd.y * float16_t(mbyte);
|
const float16_t m = loadd.y * float16_t(mbyte);
|
||||||
|
|
||||||
uint32_t dmask = 0xF << (b * 4);
|
uint qh = uint32_t(bl16.block.qh[(idx & 0x1E) >> 1]);
|
||||||
|
qh = qh & hm;
|
||||||
|
qh = unpack8(qh)[idx & 1];
|
||||||
|
|
||||||
float16_t ret = d * (float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) + float16_t((bl.block.qh[qhi ] & hm) != 0 ? 16 : 0)) - m;
|
uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
|
||||||
|
qs = (qs >> (b * 4)) & 0x0F0F;
|
||||||
|
qs = unpack8(qs)[idx & 1];
|
||||||
|
|
||||||
|
float16_t ret = d * (float16_t(qs) + (qh != 0 ? float16_t(16) : float16_t(0))) - m;
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@ -241,22 +253,30 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_
|
|||||||
block_q6_K block;
|
block_q6_K block;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Buffer-reference type that exposes one block through the block_q6_K_packed16
// struct. dequantFuncQ6_K constructs it from its decodeBufQ6_K argument
// (decodeBufQ6_K_packed16(bl)) and reads block.ql and block.qh through it.
// NOTE(review): this declares buffer_reference_align = 16 while the
// decodeBufQ6_K reference it is cast from appears to be declared with
// buffer_reference_align = 2 - confirm the stronger alignment promise is
// intended for Q6_K blocks.
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ6_K_packed16 {
|
||||||
|
block_q6_K_packed16 block;
|
||||||
|
};
|
||||||
|
|
||||||
float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||||
{
|
{
|
||||||
|
decodeBufQ6_K_packed16 bl16 = decodeBufQ6_K_packed16(bl);
|
||||||
const uint idx = coordInBlock[1];
|
const uint idx = coordInBlock[1];
|
||||||
const uint iqs = idx;
|
|
||||||
|
|
||||||
const uint n = iqs / 128; // 0,1
|
const uint b = (idx & 0x40) >> 6; // 0,1
|
||||||
const uint b = (iqs % 128) / 64; // 0,1
|
const uint qhshift = (idx & 0x60) >> 4; // 0,2,4,6
|
||||||
const uint is_b = (iqs % 32) / 16; // 0,1
|
const uint is = (idx & 0xF0) >> 4; // 0..15
|
||||||
const uint qhshift = ((iqs % 128) / 32) * 2;// 0,2,4,6
|
|
||||||
const uint is = 8 * n + qhshift + is_b; // 0..15
|
|
||||||
const uint qsi = n * 64 + (iqs % 64); // 0..127
|
|
||||||
const uint qhi = n * 32 + (iqs % 32); // 0..63
|
|
||||||
|
|
||||||
const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]);
|
const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]);
|
||||||
|
|
||||||
float16_t ret = dscale * float16_t(int8_t(((bl.block.ql[qsi ] >> (b * 4)) & 0xF) | (((bl.block.qh[qhi ] >> qhshift) & 3) << 4)) - 32);
|
uint ql = uint32_t(bl16.block.ql[((idx & 0x80) >> 2) + ((idx & 0x3E) >> 1)]);
|
||||||
|
ql = (ql >> (b * 4)) & 0x0F0F;
|
||||||
|
|
||||||
|
uint qh = uint32_t(bl16.block.qh[((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1)]);
|
||||||
|
qh = ((qh >> qhshift) & 0x0303) << 4;
|
||||||
|
|
||||||
|
int q = unpack8(ql | qh)[idx & 1];
|
||||||
|
|
||||||
|
float16_t ret = dscale * float16_t(q - 32);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -20,7 +20,7 @@ void main() {
|
|||||||
uint i00, i01, i02, i03;
|
uint i00, i01, i02, i03;
|
||||||
get_indices(idx, i00, i01, i02, i03);
|
get_indices(idx, i00, i01, i02, i03);
|
||||||
|
|
||||||
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
|
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
|
||||||
|
|
||||||
idx += num_threads;
|
idx += num_threads;
|
||||||
}
|
}
|
||||||
|
@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
|
|||||||
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
|
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
|
||||||
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
||||||
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
|
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
|
||||||
uint d_offset;
|
uint misalign_offsets;
|
||||||
float param1; float param2; int param3;
|
float param1; float param2; int param3;
|
||||||
} p;
|
} p;
|
||||||
|
|
||||||
@ -22,6 +22,10 @@ uint get_idx() {
|
|||||||
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint get_aoffset() { return p.misalign_offsets >> 16; }
|
||||||
|
uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; }
|
||||||
|
uint get_doffset() { return p.misalign_offsets & 0xFF; }
|
||||||
|
|
||||||
// mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1
|
// mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1
|
||||||
uint fastmod(uint a, uint b) {
|
uint fastmod(uint a, uint b) {
|
||||||
if ((b & (b-1)) == 0) {
|
if ((b & (b-1)) == 0) {
|
||||||
|
@ -6,7 +6,7 @@ layout (push_constant) uniform parameter
|
|||||||
uint ne;
|
uint ne;
|
||||||
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
|
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
|
||||||
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
||||||
uint d_offset;
|
uint misalign_offsets;
|
||||||
float param1; float param2;
|
float param1; float param2;
|
||||||
|
|
||||||
uint ne0_012mp; uint ne0_012L;
|
uint ne0_012mp; uint ne0_012L;
|
||||||
@ -24,6 +24,9 @@ uint get_idx() {
|
|||||||
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint get_aoffset() { return p.misalign_offsets >> 16; }
|
||||||
|
uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
|
||||||
|
|
||||||
// see init_fastdiv_values in ggml-vulkan.cpp
|
// see init_fastdiv_values in ggml-vulkan.cpp
|
||||||
uint fastdiv(uint n, uint mp, uint L) {
|
uint fastdiv(uint n, uint mp, uint L) {
|
||||||
uint msbs, lsbs;
|
uint msbs, lsbs;
|
||||||
|
@ -15,10 +15,10 @@ void main() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
|
const uint i01 = data_b[get_boffset() + i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
|
||||||
|
|
||||||
const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
|
const uint a_offset = get_aoffset() + i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
|
||||||
const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
|
const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
|
||||||
|
|
||||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||||
data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
|
data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
#extension GL_EXT_shader_16bit_storage : require
|
#extension GL_EXT_shader_16bit_storage : require
|
||||||
#extension GL_EXT_spirv_intrinsics: enable
|
#extension GL_EXT_spirv_intrinsics: enable
|
||||||
|
#extension GL_EXT_control_flow_attributes : require
|
||||||
|
|
||||||
#if RTE16
|
#if RTE16
|
||||||
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
|
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
|
||||||
@ -23,40 +24,64 @@ layout (push_constant) uniform parameter
|
|||||||
|
|
||||||
#include "types.comp"
|
#include "types.comp"
|
||||||
|
|
||||||
#define BLOCK_SIZE 256
|
layout(constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||||
|
|
||||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
const uint NUM_ITER = 512 / BLOCK_SIZE;
|
||||||
|
|
||||||
|
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint i = gl_GlobalInvocationID.x;
|
const uint gidx = gl_GlobalInvocationID.x;
|
||||||
if (i >= p.pelements) {
|
|
||||||
return;
|
const uint oh = gl_GlobalInvocationID.y;
|
||||||
|
const uint batch = gl_GlobalInvocationID.z / p.IC;
|
||||||
|
const uint ic = gl_GlobalInvocationID.z % p.IC;
|
||||||
|
|
||||||
|
A_TYPE values[NUM_ITER];
|
||||||
|
uint offset_dst[NUM_ITER];
|
||||||
|
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
|
||||||
|
values[idx] = A_TYPE(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
|
||||||
|
|
||||||
|
const uint i = gidx * NUM_ITER + idx;
|
||||||
|
|
||||||
const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
|
const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
|
||||||
const uint kx = i / ksize;
|
const uint kx = i / ksize;
|
||||||
const uint kd = kx * ksize;
|
const uint kd = kx * ksize;
|
||||||
const uint ky = (i - kd) / p.OW;
|
const uint ky = (i - kd) / p.OW;
|
||||||
const uint ix = i % p.OW;
|
const uint ix = i % p.OW;
|
||||||
|
|
||||||
const uint oh = gl_GlobalInvocationID.y;
|
|
||||||
const uint batch = gl_GlobalInvocationID.z / p.IC;
|
|
||||||
const uint ic = gl_GlobalInvocationID.z % p.IC;
|
|
||||||
|
|
||||||
const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
|
const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
|
||||||
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
|
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
|
||||||
|
|
||||||
const uint offset_dst =
|
offset_dst[idx] =
|
||||||
((batch * p.OH + oh) * p.OW + ix) * p.CHW +
|
((batch * p.OH + oh) * p.OW + ix) * p.CHW +
|
||||||
(ic * (p.KW * p.KH) + ky * p.KW + kx);
|
(ic * (p.KW * p.KH) + ky * p.KW + kx);
|
||||||
|
|
||||||
if (iih < 0 || iih >= p.IH || iiw < 0 || iiw >= p.IW) {
|
if (i >= p.pelements) {
|
||||||
data_d[offset_dst] = D_TYPE(0.0f);
|
continue;
|
||||||
} else {
|
|
||||||
const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
|
|
||||||
data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (iih < p.IH && iiw < p.IW) {
|
||||||
|
const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
|
||||||
|
values[idx] = data_a[offset_src + iih * p.IW + iiw];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
|
||||||
|
|
||||||
|
const uint i = gidx * NUM_ITER + idx;
|
||||||
|
|
||||||
|
if (i >= p.pelements) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
data_d[offset_dst[idx]] = D_TYPE(values[idx]);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -20,7 +20,7 @@ void main() {
|
|||||||
uint i00, i01, i02, i03;
|
uint i00, i01, i02, i03;
|
||||||
get_indices(idx, i00, i01, i02, i03);
|
get_indices(idx, i00, i01, i02, i03);
|
||||||
|
|
||||||
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
|
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
|
||||||
|
|
||||||
idx += num_threads;
|
idx += num_threads;
|
||||||
}
|
}
|
||||||
|
@ -6,21 +6,15 @@
|
|||||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||||
|
layout (constant_id = 1) const uint NUM_ROWS = 1;
|
||||||
|
|
||||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
|
||||||
|
|
||||||
void main() {
|
|
||||||
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
|
||||||
|
|
||||||
if (row >= p.stride_d) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||||
uint a_offset, b_offset, d_offset;
|
uint a_offset, b_offset, d_offset;
|
||||||
get_offsets(a_offset, b_offset, d_offset);
|
get_offsets(a_offset, b_offset, d_offset);
|
||||||
|
|
||||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||||
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
|
||||||
|
|
||||||
// 16 threads are used to process each block
|
// 16 threads are used to process each block
|
||||||
const uint it_size = gl_WorkGroupSize.x/16;
|
const uint it_size = gl_WorkGroupSize.x/16;
|
||||||
@ -38,15 +32,15 @@ void main() {
|
|||||||
const uint s_offset = 8*v_im;
|
const uint s_offset = 8*v_im;
|
||||||
const uint y_offset = 128*v_im + l0;
|
const uint y_offset = 128*v_im + l0;
|
||||||
|
|
||||||
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
FLOAT_TYPE temp[NUM_ROWS];
|
||||||
|
|
||||||
|
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||||
|
temp[i] = FLOAT_TYPE(0);
|
||||||
|
}
|
||||||
|
|
||||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||||
const uint y_idx = i * QUANT_K + y_offset;
|
const uint y_idx = i * QUANT_K + y_offset;
|
||||||
|
|
||||||
f16vec2 d = data_a[ib0 + i].d;
|
|
||||||
const FLOAT_TYPE dall = d.x;
|
|
||||||
const FLOAT_TYPE dmin = d.y;
|
|
||||||
|
|
||||||
B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
|
B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
|
||||||
B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
|
B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
|
||||||
B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
|
B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
|
||||||
@ -56,6 +50,12 @@ void main() {
|
|||||||
B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
|
B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
|
||||||
B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
|
B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
|
||||||
|
|
||||||
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||||
|
f16vec2 d = data_a[ib0 + i].d;
|
||||||
|
const FLOAT_TYPE dall = d.x;
|
||||||
|
const FLOAT_TYPE dmin = d.y;
|
||||||
|
|
||||||
uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0];
|
uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0];
|
||||||
uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1];
|
uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1];
|
||||||
|
|
||||||
@ -94,20 +94,40 @@ void main() {
|
|||||||
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_hi4[2]),
|
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_hi4[2]),
|
||||||
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2))))))));
|
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2))))))));
|
||||||
}
|
}
|
||||||
temp = fma(dall, sum1, fma(-dmin, sum2, temp));
|
temp[n] = fma(dall, sum1, fma(-dmin, sum2, temp[n]));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
tmp[gl_LocalInvocationID.x] = temp;
|
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
tmpsh[n][tid] = temp[n];
|
||||||
|
}
|
||||||
barrier();
|
barrier();
|
||||||
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
|
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
|
||||||
if (tid < s) {
|
if (tid < s) {
|
||||||
tmp[tid] += tmp[tid + s];
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
tmpsh[n][tid] += tmpsh[n][tid + s];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
}
|
}
|
||||||
if (tid == 0) {
|
if (tid == 0) {
|
||||||
data_d[d_offset + row] = D_TYPE(tmp[0]);
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void main() {
|
||||||
|
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||||
|
|
||||||
|
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||||
|
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||||
|
compute_outputs(first_row, NUM_ROWS);
|
||||||
|
} else {
|
||||||
|
if (first_row >= p.stride_d) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
compute_outputs(first_row, p.stride_d - first_row);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -6,21 +6,15 @@
|
|||||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||||
|
layout (constant_id = 1) const uint NUM_ROWS = 1;
|
||||||
|
|
||||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
|
||||||
|
|
||||||
void main() {
|
|
||||||
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
|
||||||
|
|
||||||
if (row >= p.stride_d) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||||
uint a_offset, b_offset, d_offset;
|
uint a_offset, b_offset, d_offset;
|
||||||
get_offsets(a_offset, b_offset, d_offset);
|
get_offsets(a_offset, b_offset, d_offset);
|
||||||
|
|
||||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||||
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
|
||||||
|
|
||||||
// 16 threads are used to process each block
|
// 16 threads are used to process each block
|
||||||
const uint it_size = gl_WorkGroupSize.x/16;
|
const uint it_size = gl_WorkGroupSize.x/16;
|
||||||
@ -39,15 +33,17 @@ void main() {
|
|||||||
const uint q_offset = 32*v_im + l0;
|
const uint q_offset = 32*v_im + l0;
|
||||||
const uint y_offset = 128*v_im + l0;
|
const uint y_offset = 128*v_im + l0;
|
||||||
|
|
||||||
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
FLOAT_TYPE temp[NUM_ROWS];
|
||||||
|
|
||||||
|
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||||
|
temp[i] = FLOAT_TYPE(0);
|
||||||
|
}
|
||||||
|
|
||||||
const uint s_shift = 4 * v_im;
|
const uint s_shift = 4 * v_im;
|
||||||
|
|
||||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||||
const uint y_idx = i * QUANT_K + y_offset;
|
const uint y_idx = i * QUANT_K + y_offset;
|
||||||
|
|
||||||
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
|
||||||
|
|
||||||
B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
|
B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
|
||||||
B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
|
B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
|
||||||
B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
|
B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
|
||||||
@ -57,6 +53,10 @@ void main() {
|
|||||||
B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
|
B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
|
||||||
B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
|
B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
|
||||||
|
|
||||||
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||||
|
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
||||||
|
|
||||||
uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];
|
uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];
|
||||||
uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];
|
uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];
|
||||||
uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];
|
uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];
|
||||||
@ -81,20 +81,40 @@ void main() {
|
|||||||
fma(FLOAT_TYPE(b80[l]) * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
|
fma(FLOAT_TYPE(b80[l]) * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
|
||||||
fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
|
fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
|
||||||
}
|
}
|
||||||
temp = fma(d, sum, temp);
|
temp[n] = fma(d, sum, temp[n]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
tmp[gl_LocalInvocationID.x] = temp;
|
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
tmpsh[n][tid] = temp[n];
|
||||||
|
}
|
||||||
barrier();
|
barrier();
|
||||||
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
|
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
|
||||||
if (tid < s) {
|
if (tid < s) {
|
||||||
tmp[tid] += tmp[tid + s];
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
tmpsh[n][tid] += tmpsh[n][tid + s];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
}
|
}
|
||||||
if (tid == 0) {
|
if (tid == 0) {
|
||||||
data_d[d_offset + row] = D_TYPE(tmp[0]);
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void main() {
|
||||||
|
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||||
|
|
||||||
|
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||||
|
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||||
|
compute_outputs(first_row, NUM_ROWS);
|
||||||
|
} else {
|
||||||
|
if (first_row >= p.stride_d) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
compute_outputs(first_row, p.stride_d - first_row);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,21 +7,15 @@
|
|||||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||||
|
layout (constant_id = 1) const uint NUM_ROWS = 1;
|
||||||
|
|
||||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
|
||||||
|
|
||||||
void main() {
|
|
||||||
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
|
||||||
|
|
||||||
if (row >= p.stride_d) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||||
uint a_offset, b_offset, d_offset;
|
uint a_offset, b_offset, d_offset;
|
||||||
get_offsets(a_offset, b_offset, d_offset);
|
get_offsets(a_offset, b_offset, d_offset);
|
||||||
|
|
||||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||||
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
|
||||||
|
|
||||||
// 16 threads are used to process each block
|
// 16 threads are used to process each block
|
||||||
const uint it_size = gl_WorkGroupSize.x/16;
|
const uint it_size = gl_WorkGroupSize.x/16;
|
||||||
@ -42,12 +36,23 @@ void main() {
|
|||||||
const uint q_offset = 32*v_im + l0;
|
const uint q_offset = 32*v_im + l0;
|
||||||
const uint y_offset = 64*v_im + l0;
|
const uint y_offset = 64*v_im + l0;
|
||||||
|
|
||||||
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
FLOAT_TYPE temp[NUM_ROWS];
|
||||||
|
|
||||||
|
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||||
|
temp[i] = FLOAT_TYPE(0);
|
||||||
|
}
|
||||||
|
|
||||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||||
const uint y1_idx = i * QUANT_K + y_offset;
|
const uint y1_idx = i * QUANT_K + y_offset;
|
||||||
const uint y2_idx = y1_idx + 128;
|
const uint y2_idx = y1_idx + 128;
|
||||||
|
|
||||||
|
B_TYPE_VEC4 by10 = data_b_v4[(b_offset + y1_idx) / 4];
|
||||||
|
B_TYPE_VEC4 by132 = data_b_v4[(b_offset + y1_idx) / 4 + 8];
|
||||||
|
B_TYPE_VEC4 by20 = data_b_v4[(b_offset + y2_idx) / 4];
|
||||||
|
B_TYPE_VEC4 by232 = data_b_v4[(b_offset + y2_idx) / 4 + 8];
|
||||||
|
|
||||||
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||||
f16vec2 d = data_a[ib0 + i].d;
|
f16vec2 d = data_a[ib0 + i].d;
|
||||||
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
|
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
|
||||||
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
|
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
|
||||||
@ -98,11 +103,6 @@ void main() {
|
|||||||
const uint32_t q4_14 = qs64_hi4.z;
|
const uint32_t q4_14 = qs64_hi4.z;
|
||||||
const uint32_t q4_15 = qs64_hi4.w;
|
const uint32_t q4_15 = qs64_hi4.w;
|
||||||
|
|
||||||
B_TYPE_VEC4 by10 = data_b_v4[(b_offset + y1_idx) / 4];
|
|
||||||
B_TYPE_VEC4 by132 = data_b_v4[(b_offset + y1_idx) / 4 + 8];
|
|
||||||
B_TYPE_VEC4 by20 = data_b_v4[(b_offset + y2_idx) / 4];
|
|
||||||
B_TYPE_VEC4 by232 = data_b_v4[(b_offset + y2_idx) / 4 + 8];
|
|
||||||
|
|
||||||
const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.y), q4_1, fma(FLOAT_TYPE(by10.z), q4_2, FLOAT_TYPE(by10.w) * q4_3)));
|
const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.y), q4_1, fma(FLOAT_TYPE(by10.z), q4_2, FLOAT_TYPE(by10.w) * q4_3)));
|
||||||
const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x), q4_4, fma(FLOAT_TYPE(by132.y), q4_5, fma(FLOAT_TYPE(by132.z), q4_6, FLOAT_TYPE(by132.w) * q4_7)));
|
const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x), q4_4, fma(FLOAT_TYPE(by132.y), q4_5, fma(FLOAT_TYPE(by132.z), q4_6, FLOAT_TYPE(by132.w) * q4_7)));
|
||||||
const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x), q4_8, fma(FLOAT_TYPE(by20.y), q4_9, fma(FLOAT_TYPE(by20.z), q4_10, FLOAT_TYPE(by20.w) * q4_11)));
|
const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x), q4_8, fma(FLOAT_TYPE(by20.y), q4_9, fma(FLOAT_TYPE(by20.z), q4_10, FLOAT_TYPE(by20.w) * q4_11)));
|
||||||
@ -112,20 +112,40 @@ void main() {
|
|||||||
fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7,
|
fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7,
|
||||||
fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7,
|
fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7,
|
||||||
fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6, FLOAT_TYPE(by232.w) * sc7)))))))))))))));
|
fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6, FLOAT_TYPE(by232.w) * sc7)))))))))))))));
|
||||||
temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp));
|
temp[n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[n]));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
tmp[gl_LocalInvocationID.x] = temp;
|
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
tmpsh[n][tid] = temp[n];
|
||||||
|
}
|
||||||
barrier();
|
barrier();
|
||||||
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
|
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
|
||||||
if (tid < s) {
|
if (tid < s) {
|
||||||
tmp[tid] += tmp[tid + s];
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
tmpsh[n][tid] += tmpsh[n][tid + s];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
}
|
}
|
||||||
if (tid == 0) {
|
if (tid == 0) {
|
||||||
data_d[d_offset + row] = D_TYPE(tmp[0]);
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void main() {
|
||||||
|
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||||
|
|
||||||
|
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||||
|
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||||
|
compute_outputs(first_row, NUM_ROWS);
|
||||||
|
} else {
|
||||||
|
if (first_row >= p.stride_d) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
compute_outputs(first_row, p.stride_d - first_row);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,21 +7,15 @@
|
|||||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||||
|
layout (constant_id = 1) const uint NUM_ROWS = 1;
|
||||||
|
|
||||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
|
||||||
|
|
||||||
void main() {
|
|
||||||
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
|
||||||
|
|
||||||
if (row >= p.stride_d) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||||
uint a_offset, b_offset, d_offset;
|
uint a_offset, b_offset, d_offset;
|
||||||
get_offsets(a_offset, b_offset, d_offset);
|
get_offsets(a_offset, b_offset, d_offset);
|
||||||
|
|
||||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||||
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
|
||||||
|
|
||||||
// 16 threads are used to process each block
|
// 16 threads are used to process each block
|
||||||
const uint it_size = gl_WorkGroupSize.x/16;
|
const uint it_size = gl_WorkGroupSize.x/16;
|
||||||
@ -39,12 +33,27 @@ void main() {
|
|||||||
const uint q_offset = 32*v_im + l0;
|
const uint q_offset = 32*v_im + l0;
|
||||||
const uint y_offset = 64*v_im + l0;
|
const uint y_offset = 64*v_im + l0;
|
||||||
|
|
||||||
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
FLOAT_TYPE temp[NUM_ROWS];
|
||||||
|
|
||||||
|
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||||
|
temp[i] = FLOAT_TYPE(0);
|
||||||
|
}
|
||||||
|
|
||||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||||
const uint y1_idx = i * QUANT_K + y_offset;
|
const uint y1_idx = i * QUANT_K + y_offset;
|
||||||
const uint y2_idx = y1_idx + 128;
|
const uint y2_idx = y1_idx + 128;
|
||||||
|
|
||||||
|
B_TYPE_VEC2 by10 = data_b_v2[(b_offset + y1_idx) / 2];
|
||||||
|
B_TYPE_VEC2 by116 = data_b_v2[(b_offset + y1_idx) / 2 + 8];
|
||||||
|
B_TYPE_VEC2 by132 = data_b_v2[(b_offset + y1_idx) / 2 + 16];
|
||||||
|
B_TYPE_VEC2 by148 = data_b_v2[(b_offset + y1_idx) / 2 + 24];
|
||||||
|
B_TYPE_VEC2 by20 = data_b_v2[(b_offset + y2_idx) / 2];
|
||||||
|
B_TYPE_VEC2 by216 = data_b_v2[(b_offset + y2_idx) / 2 + 8];
|
||||||
|
B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16];
|
||||||
|
B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24];
|
||||||
|
|
||||||
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||||
f16vec2 d = data_a[ib0 + i].d;
|
f16vec2 d = data_a[ib0 + i].d;
|
||||||
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
|
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
|
||||||
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
|
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
|
||||||
@ -107,15 +116,6 @@ void main() {
|
|||||||
const uint32_t q4_14 = qs64_80_hi4.z;
|
const uint32_t q4_14 = qs64_80_hi4.z;
|
||||||
const uint32_t q4_15 = qs64_80_hi4.w;
|
const uint32_t q4_15 = qs64_80_hi4.w;
|
||||||
|
|
||||||
B_TYPE_VEC2 by10 = data_b_v2[(b_offset + y1_idx) / 2];
|
|
||||||
B_TYPE_VEC2 by116 = data_b_v2[(b_offset + y1_idx) / 2 + 8];
|
|
||||||
B_TYPE_VEC2 by132 = data_b_v2[(b_offset + y1_idx) / 2 + 16];
|
|
||||||
B_TYPE_VEC2 by148 = data_b_v2[(b_offset + y1_idx) / 2 + 24];
|
|
||||||
B_TYPE_VEC2 by20 = data_b_v2[(b_offset + y2_idx) / 2];
|
|
||||||
B_TYPE_VEC2 by216 = data_b_v2[(b_offset + y2_idx) / 2 + 8];
|
|
||||||
B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16];
|
|
||||||
B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24];
|
|
||||||
|
|
||||||
const FLOAT_TYPE sx =
|
const FLOAT_TYPE sx =
|
||||||
fma(FLOAT_TYPE(by10.x), q4_0,
|
fma(FLOAT_TYPE(by10.x), q4_0,
|
||||||
fma(FLOAT_TYPE(by10.y), q4_1,
|
fma(FLOAT_TYPE(by10.y), q4_1,
|
||||||
@ -141,20 +141,40 @@ void main() {
|
|||||||
fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
|
fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
|
||||||
fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6,
|
fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6,
|
||||||
(FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7)));
|
(FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7)));
|
||||||
temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp));
|
temp[n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[n]));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
tmp[gl_LocalInvocationID.x] = temp;
|
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
tmpsh[n][tid] = temp[n];
|
||||||
|
}
|
||||||
barrier();
|
barrier();
|
||||||
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
|
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
|
||||||
if (tid < s) {
|
if (tid < s) {
|
||||||
tmp[tid] += tmp[tid + s];
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
tmpsh[n][tid] += tmpsh[n][tid + s];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
}
|
}
|
||||||
if (tid == 0) {
|
if (tid == 0) {
|
||||||
data_d[d_offset + row] = D_TYPE(tmp[0]);
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void main() {
|
||||||
|
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||||
|
|
||||||
|
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||||
|
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||||
|
compute_outputs(first_row, NUM_ROWS);
|
||||||
|
} else {
|
||||||
|
if (first_row >= p.stride_d) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
compute_outputs(first_row, p.stride_d - first_row);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,21 +7,15 @@
|
|||||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||||
|
layout (constant_id = 1) const uint NUM_ROWS = 1;
|
||||||
|
|
||||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
|
||||||
|
|
||||||
void main() {
|
|
||||||
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
|
||||||
|
|
||||||
if (row >= p.stride_d) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||||
uint a_offset, b_offset, d_offset;
|
uint a_offset, b_offset, d_offset;
|
||||||
get_offsets(a_offset, b_offset, d_offset);
|
get_offsets(a_offset, b_offset, d_offset);
|
||||||
|
|
||||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||||
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
|
||||||
|
|
||||||
// 16 threads are used to process each block
|
// 16 threads are used to process each block
|
||||||
const uint it_size = gl_WorkGroupSize.x/16;
|
const uint it_size = gl_WorkGroupSize.x/16;
|
||||||
@ -42,11 +36,22 @@ void main() {
|
|||||||
const uint s_offset = 8*v_im + is;
|
const uint s_offset = 8*v_im + is;
|
||||||
const uint y_offset = 128*v_im + l0;
|
const uint y_offset = 128*v_im + l0;
|
||||||
|
|
||||||
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
FLOAT_TYPE temp[NUM_ROWS];
|
||||||
|
|
||||||
|
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||||
|
temp[i] = FLOAT_TYPE(0);
|
||||||
|
}
|
||||||
|
|
||||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||||
const uint y_idx = i * QUANT_K + y_offset;
|
const uint y_idx = i * QUANT_K + y_offset;
|
||||||
|
|
||||||
|
B_TYPE_VEC4 by0 = data_b_v4[(b_offset + y_idx) / 4];
|
||||||
|
B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8];
|
||||||
|
B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16];
|
||||||
|
B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24];
|
||||||
|
|
||||||
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||||
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
||||||
|
|
||||||
FLOAT_TYPE scales[4];
|
FLOAT_TYPE scales[4];
|
||||||
@ -79,11 +84,6 @@ void main() {
|
|||||||
uvec4 q2 = uvec4(unpack8(q2_u32));
|
uvec4 q2 = uvec4(unpack8(q2_u32));
|
||||||
uvec4 q3 = uvec4(unpack8(q3_u32));
|
uvec4 q3 = uvec4(unpack8(q3_u32));
|
||||||
|
|
||||||
B_TYPE_VEC4 by0 = data_b_v4[(b_offset + y_idx) / 4];
|
|
||||||
B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8];
|
|
||||||
B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16];
|
|
||||||
B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24];
|
|
||||||
|
|
||||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||||
[[unroll]] for (int l = 0; l < 4; ++l) {
|
[[unroll]] for (int l = 0; l < 4; ++l) {
|
||||||
sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32),
|
sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32),
|
||||||
@ -91,20 +91,40 @@ void main() {
|
|||||||
fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32),
|
fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32),
|
||||||
fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum))));
|
fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum))));
|
||||||
}
|
}
|
||||||
temp += sum * d;
|
temp[n] += sum * d;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
tmp[gl_LocalInvocationID.x] = temp;
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
tmpsh[n][tid] = temp[n];
|
||||||
|
}
|
||||||
barrier();
|
barrier();
|
||||||
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
|
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
|
||||||
if (tid < s) {
|
if (tid < s) {
|
||||||
tmp[tid] += tmp[tid + s];
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
tmpsh[n][tid] += tmpsh[n][tid + s];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
}
|
}
|
||||||
if (tid == 0) {
|
if (tid == 0) {
|
||||||
data_d[d_offset + row] = D_TYPE(tmp[0]);
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
|
data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void main() {
|
||||||
|
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||||
|
|
||||||
|
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||||
|
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||||
|
compute_outputs(first_row, NUM_ROWS);
|
||||||
|
} else {
|
||||||
|
if (first_row >= p.stride_d) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
compute_outputs(first_row, p.stride_d - first_row);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -24,5 +24,5 @@ void main() {
|
|||||||
|
|
||||||
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
|
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
|
||||||
|
|
||||||
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f);
|
data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
|
||||||
}
|
}
|
||||||
|
@ -22,5 +22,5 @@ void main() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]);
|
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx_mod(idx)]);
|
||||||
}
|
}
|
||||||
|
@ -18,7 +18,7 @@ void main() {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
data_d[p.d_offset + idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1));
|
data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) * FLOAT_TYPE(p.param1));
|
||||||
idx += num_threads;
|
idx += num_threads;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -12,6 +12,6 @@ void main() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val));
|
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sin(val));
|
||||||
}
|
}
|
||||||
|
@ -12,6 +12,6 @@ void main() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val);
|
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val * val);
|
||||||
}
|
}
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
layout (push_constant) uniform parameter
|
layout (push_constant) uniform parameter
|
||||||
{
|
{
|
||||||
uint ne; uint d_offset;
|
uint ne; uint a_offset; uint d_offset;
|
||||||
uint nb00; uint nb01; uint nb02; uint nb03;
|
uint nb00; uint nb01; uint nb02; uint nb03;
|
||||||
uint ne10; uint ne11; uint ne12; uint ne13;
|
uint ne10; uint ne11; uint ne12; uint ne13;
|
||||||
float sf0; float sf1; float sf2; float sf3;
|
float sf0; float sf1; float sf2; float sf3;
|
||||||
@ -32,5 +32,5 @@ void main() {
|
|||||||
const uint i02 = uint(i12 / p.sf2);
|
const uint i02 = uint(i12 / p.sf2);
|
||||||
const uint i03 = uint(i13 / p.sf3);
|
const uint i03 = uint(i13 / p.sf3);
|
||||||
|
|
||||||
data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
|
data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
|
||||||
}
|
}
|
||||||
|
@ -78,7 +78,8 @@ void execute_command(const std::string& command, std::string& stdout_str, std::s
|
|||||||
}
|
}
|
||||||
|
|
||||||
PROCESS_INFORMATION pi;
|
PROCESS_INFORMATION pi;
|
||||||
STARTUPINFOA si = { sizeof(STARTUPINFOA) };
|
STARTUPINFOA si = {};
|
||||||
|
si.cb = sizeof(STARTUPINFOA);
|
||||||
si.dwFlags = STARTF_USESTDHANDLES;
|
si.dwFlags = STARTF_USESTDHANDLES;
|
||||||
si.hStdOutput = stdout_write;
|
si.hStdOutput = stdout_write;
|
||||||
si.hStdError = stderr_write;
|
si.hStdError = stderr_write;
|
||||||
|
@ -221,6 +221,7 @@ class GGUFType:
|
|||||||
|
|
||||||
class MODEL_ARCH(IntEnum):
|
class MODEL_ARCH(IntEnum):
|
||||||
LLAMA = auto()
|
LLAMA = auto()
|
||||||
|
DECI = auto()
|
||||||
FALCON = auto()
|
FALCON = auto()
|
||||||
BAICHUAN = auto()
|
BAICHUAN = auto()
|
||||||
GROK = auto()
|
GROK = auto()
|
||||||
@ -402,6 +403,7 @@ class MODEL_TENSOR(IntEnum):
|
|||||||
|
|
||||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
MODEL_ARCH.LLAMA: "llama",
|
MODEL_ARCH.LLAMA: "llama",
|
||||||
|
MODEL_ARCH.DECI: "deci",
|
||||||
MODEL_ARCH.FALCON: "falcon",
|
MODEL_ARCH.FALCON: "falcon",
|
||||||
MODEL_ARCH.BAICHUAN: "baichuan",
|
MODEL_ARCH.BAICHUAN: "baichuan",
|
||||||
MODEL_ARCH.GROK: "grok",
|
MODEL_ARCH.GROK: "grok",
|
||||||
@ -602,6 +604,26 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||||
MODEL_TENSOR.FFN_UP_EXP,
|
MODEL_TENSOR.FFN_UP_EXP,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.DECI: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_Q,
|
||||||
|
MODEL_TENSOR.ATTN_K,
|
||||||
|
MODEL_TENSOR.ATTN_V,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
|
MODEL_TENSOR.FFN_GATE_INP,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_GATE,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
MODEL_TENSOR.FFN_GATE_EXP,
|
||||||
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||||
|
MODEL_TENSOR.FFN_UP_EXP,
|
||||||
|
],
|
||||||
MODEL_ARCH.GROK: [
|
MODEL_ARCH.GROK: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
MODEL_TENSOR.OUTPUT_NORM,
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
@ -1448,6 +1470,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||||||
MODEL_TENSOR.ROPE_FREQS,
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.DECI: [
|
||||||
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
|
],
|
||||||
MODEL_ARCH.BAICHUAN: [
|
MODEL_ARCH.BAICHUAN: [
|
||||||
MODEL_TENSOR.ROPE_FREQS,
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
|
@ -198,6 +198,7 @@ class TensorNameMap:
|
|||||||
"transformer.h.{bid}.self_attention.dense", # falcon
|
"transformer.h.{bid}.self_attention.dense", # falcon
|
||||||
"h.{bid}.self_attention.dense", # bloom
|
"h.{bid}.self_attention.dense", # bloom
|
||||||
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2
|
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2
|
||||||
|
"model.layers.{bid}.self_attn.linear_attn", # deci
|
||||||
"layers.{bid}.attention.wo", # llama-pth
|
"layers.{bid}.attention.wo", # llama-pth
|
||||||
"encoder.layer.{bid}.attention.output.dense", # bert
|
"encoder.layer.{bid}.attention.output.dense", # bert
|
||||||
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
||||||
|
@ -126,6 +126,8 @@ connection = sqlite3.connect(input_file)
|
|||||||
cursor = connection.cursor()
|
cursor = connection.cursor()
|
||||||
builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
|
builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
|
||||||
|
|
||||||
|
commit_short_len = len(builds[0][0])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
repo = git.Repo(".", search_parent_directories=True)
|
repo = git.Repo(".", search_parent_directories=True)
|
||||||
except git.InvalidGitRepositoryError:
|
except git.InvalidGitRepositoryError:
|
||||||
@ -138,11 +140,11 @@ def find_parent_in_data(commit: git.Commit):
|
|||||||
seen_hexsha8 = set()
|
seen_hexsha8 = set()
|
||||||
while heap:
|
while heap:
|
||||||
depth, current_commit = heapq.heappop(heap)
|
depth, current_commit = heapq.heappop(heap)
|
||||||
current_hexsha8 = commit.hexsha[:8]
|
current_hexsha8 = commit.hexsha[:commit_short_len]
|
||||||
if (current_hexsha8,) in builds:
|
if (current_hexsha8,) in builds:
|
||||||
return current_hexsha8
|
return current_hexsha8
|
||||||
for parent in commit.parents:
|
for parent in commit.parents:
|
||||||
parent_hexsha8 = parent.hexsha[:8]
|
parent_hexsha8 = parent.hexsha[:commit_short_len]
|
||||||
if parent_hexsha8 not in seen_hexsha8:
|
if parent_hexsha8 not in seen_hexsha8:
|
||||||
seen_hexsha8.add(parent_hexsha8)
|
seen_hexsha8.add(parent_hexsha8)
|
||||||
heapq.heappush(heap, (depth + 1, parent))
|
heapq.heappush(heap, (depth + 1, parent))
|
||||||
@ -156,9 +158,9 @@ def get_all_parent_hexsha8s(commit: git.Commit):
|
|||||||
|
|
||||||
while unvisited:
|
while unvisited:
|
||||||
current_commit = unvisited.pop(0)
|
current_commit = unvisited.pop(0)
|
||||||
visited.append(current_commit.hexsha[:8])
|
visited.append(current_commit.hexsha[:commit_short_len])
|
||||||
for parent in current_commit.parents:
|
for parent in current_commit.parents:
|
||||||
if parent.hexsha[:8] not in visited:
|
if parent.hexsha[:commit_short_len] not in visited:
|
||||||
unvisited.append(parent)
|
unvisited.append(parent)
|
||||||
|
|
||||||
return visited
|
return visited
|
||||||
@ -169,10 +171,10 @@ def get_commit_name(hexsha8):
|
|||||||
if repo is None:
|
if repo is None:
|
||||||
return hexsha8
|
return hexsha8
|
||||||
for h in repo.heads:
|
for h in repo.heads:
|
||||||
if h.commit.hexsha[:8] == hexsha8:
|
if h.commit.hexsha[:commit_short_len] == hexsha8:
|
||||||
return h.name
|
return h.name
|
||||||
for t in repo.tags:
|
for t in repo.tags:
|
||||||
if t.commit.hexsha[:8] == hexsha8:
|
if t.commit.hexsha[:commit_short_len] == hexsha8:
|
||||||
return t.name
|
return t.name
|
||||||
return hexsha8
|
return hexsha8
|
||||||
|
|
||||||
@ -183,13 +185,13 @@ def get_commit_hexsha8(name):
|
|||||||
return None
|
return None
|
||||||
for h in repo.heads:
|
for h in repo.heads:
|
||||||
if h.name == name:
|
if h.name == name:
|
||||||
return h.commit.hexsha[:8]
|
return h.commit.hexsha[:commit_short_len]
|
||||||
for t in repo.tags:
|
for t in repo.tags:
|
||||||
if t.name == name:
|
if t.name == name:
|
||||||
return t.commit.hexsha[:8]
|
return t.commit.hexsha[:commit_short_len]
|
||||||
for c in repo.iter_commits("--all"):
|
for c in repo.iter_commits("--all"):
|
||||||
if c.hexsha[:8] == name[:8]:
|
if c.hexsha[:commit_short_len] == name[:commit_short_len]:
|
||||||
return c.hexsha[:8]
|
return c.hexsha[:commit_short_len]
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
@ -26,7 +26,7 @@ function has_cmd {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if has_cmd wget; then
|
if has_cmd wget; then
|
||||||
cmd="wget -q --show-progress -c -O %s/%s %s"
|
cmd="wget -q -c -O %s/%s %s"
|
||||||
elif has_cmd curl; then
|
elif has_cmd curl; then
|
||||||
cmd="curl -C - -f --output-dir %s -o %s -L %s"
|
cmd="curl -C - -f --output-dir %s -o %s -L %s"
|
||||||
else
|
else
|
||||||
|
@ -1657,7 +1657,7 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
|
|||||||
}
|
}
|
||||||
|
|
||||||
llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
|
llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
|
||||||
return vocab.special_bos_id;
|
return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
|
llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
|
||||||
|
@ -45,7 +45,7 @@ struct llama_vocab {
|
|||||||
id special_unk_id = 0;
|
id special_unk_id = 0;
|
||||||
id special_sep_id = LLAMA_TOKEN_NULL;
|
id special_sep_id = LLAMA_TOKEN_NULL;
|
||||||
id special_pad_id = LLAMA_TOKEN_NULL;
|
id special_pad_id = LLAMA_TOKEN_NULL;
|
||||||
id special_cls_id = LLAMA_TOKEN_NULL;
|
id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
|
||||||
id special_mask_id = LLAMA_TOKEN_NULL;
|
id special_mask_id = LLAMA_TOKEN_NULL;
|
||||||
|
|
||||||
id linefeed_id = 13;
|
id linefeed_id = 13;
|
||||||
|
300
src/llama.cpp
300
src/llama.cpp
@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) {
|
|||||||
|
|
||||||
enum llm_arch {
|
enum llm_arch {
|
||||||
LLM_ARCH_LLAMA,
|
LLM_ARCH_LLAMA,
|
||||||
|
LLM_ARCH_DECI,
|
||||||
LLM_ARCH_FALCON,
|
LLM_ARCH_FALCON,
|
||||||
LLM_ARCH_BAICHUAN,
|
LLM_ARCH_BAICHUAN,
|
||||||
LLM_ARCH_GROK,
|
LLM_ARCH_GROK,
|
||||||
@ -203,6 +204,7 @@ enum llm_arch {
|
|||||||
|
|
||||||
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_LLAMA, "llama" },
|
{ LLM_ARCH_LLAMA, "llama" },
|
||||||
|
{ LLM_ARCH_DECI, "deci" },
|
||||||
{ LLM_ARCH_FALCON, "falcon" },
|
{ LLM_ARCH_FALCON, "falcon" },
|
||||||
{ LLM_ARCH_GROK, "grok" },
|
{ LLM_ARCH_GROK, "grok" },
|
||||||
{ LLM_ARCH_GPT2, "gpt2" },
|
{ LLM_ARCH_GPT2, "gpt2" },
|
||||||
@ -674,6 +676,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|||||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_DECI,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||||
|
{ LLM_TENSOR_OUTPUT, "output" },
|
||||||
|
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||||
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||||
|
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||||
|
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||||
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
||||||
|
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||||
|
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
LLM_ARCH_BAICHUAN,
|
LLM_ARCH_BAICHUAN,
|
||||||
{
|
{
|
||||||
@ -1673,6 +1701,7 @@ enum llm_chat_template {
|
|||||||
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
|
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
|
||||||
LLM_CHAT_TEMPLATE_MISTRAL_V7,
|
LLM_CHAT_TEMPLATE_MISTRAL_V7,
|
||||||
LLM_CHAT_TEMPLATE_PHI_3,
|
LLM_CHAT_TEMPLATE_PHI_3,
|
||||||
|
LLM_CHAT_TEMPLATE_FALCON_3,
|
||||||
LLM_CHAT_TEMPLATE_ZEPHYR,
|
LLM_CHAT_TEMPLATE_ZEPHYR,
|
||||||
LLM_CHAT_TEMPLATE_MONARCH,
|
LLM_CHAT_TEMPLATE_MONARCH,
|
||||||
LLM_CHAT_TEMPLATE_GEMMA,
|
LLM_CHAT_TEMPLATE_GEMMA,
|
||||||
@ -1691,6 +1720,7 @@ enum llm_chat_template {
|
|||||||
LLM_CHAT_TEMPLATE_RWKV_WORLD,
|
LLM_CHAT_TEMPLATE_RWKV_WORLD,
|
||||||
LLM_CHAT_TEMPLATE_GRANITE,
|
LLM_CHAT_TEMPLATE_GRANITE,
|
||||||
LLM_CHAT_TEMPLATE_GIGACHAT,
|
LLM_CHAT_TEMPLATE_GIGACHAT,
|
||||||
|
LLM_CHAT_TEMPLATE_MEGREZ,
|
||||||
LLM_CHAT_TEMPLATE_UNKNOWN,
|
LLM_CHAT_TEMPLATE_UNKNOWN,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1705,6 +1735,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
|||||||
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
|
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
|
||||||
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
|
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
|
||||||
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
|
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
|
||||||
|
{ "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
|
||||||
{ "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
|
{ "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
|
||||||
{ "monarch", LLM_CHAT_TEMPLATE_MONARCH },
|
{ "monarch", LLM_CHAT_TEMPLATE_MONARCH },
|
||||||
{ "gemma", LLM_CHAT_TEMPLATE_GEMMA },
|
{ "gemma", LLM_CHAT_TEMPLATE_GEMMA },
|
||||||
@ -1723,6 +1754,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
|||||||
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
|
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
|
||||||
{ "granite", LLM_CHAT_TEMPLATE_GRANITE },
|
{ "granite", LLM_CHAT_TEMPLATE_GRANITE },
|
||||||
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
|
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
|
||||||
|
{ "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
|
||||||
};
|
};
|
||||||
|
|
||||||
static llm_arch llm_arch_from_string(const std::string & name) {
|
static llm_arch llm_arch_from_string(const std::string & name) {
|
||||||
@ -5692,7 +5724,7 @@ static void llm_load_hparams(
|
|||||||
|
|
||||||
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
|
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
|
||||||
|
|
||||||
if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
|
if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
|
||||||
if (hparams.n_rot != hparams.n_embd_head_k) {
|
if (hparams.n_rot != hparams.n_embd_head_k) {
|
||||||
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
|
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
|
||||||
}
|
}
|
||||||
@ -5732,6 +5764,15 @@ static void llm_load_hparams(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_DECI:
|
||||||
|
{
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
|
switch (hparams.n_layer) {
|
||||||
|
case 32: model.type = e_model::MODEL_7B; break;
|
||||||
|
case 80: model.type = e_model::MODEL_70B; break;
|
||||||
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_MINICPM:
|
case LLM_ARCH_MINICPM:
|
||||||
{
|
{
|
||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
@ -6562,7 +6603,8 @@ static void llm_load_vocab(
|
|||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "llama3" ||
|
tokenizer_pre == "llama3" ||
|
||||||
tokenizer_pre == "llama-v3" ||
|
tokenizer_pre == "llama-v3" ||
|
||||||
tokenizer_pre == "llama-bpe") {
|
tokenizer_pre == "llama-bpe"||
|
||||||
|
tokenizer_pre == "falcon3") {
|
||||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
|
||||||
vocab.tokenizer_ignore_merges = true;
|
vocab.tokenizer_ignore_merges = true;
|
||||||
vocab.tokenizer_add_bos = true;
|
vocab.tokenizer_add_bos = true;
|
||||||
@ -6663,6 +6705,9 @@ static void llm_load_vocab(
|
|||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "minerva-7b") {
|
tokenizer_pre == "minerva-7b") {
|
||||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "megrez") {
|
||||||
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
||||||
} else {
|
} else {
|
||||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||||
}
|
}
|
||||||
@ -7936,6 +7981,68 @@ static bool llm_load_tensors(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_DECI:
|
||||||
|
{
|
||||||
|
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||||
|
|
||||||
|
// output
|
||||||
|
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||||
|
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||||
|
|
||||||
|
// if output is NULL, init from the input tok embed
|
||||||
|
if (model.output == NULL) {
|
||||||
|
model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < n_layer; ++i) {
|
||||||
|
auto & layer = model.layers[i];
|
||||||
|
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
|
||||||
|
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
|
||||||
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
|
||||||
|
const int64_t n_ff = hparams.n_ff(i);
|
||||||
|
const int64_t n_head = hparams.n_head(i);
|
||||||
|
const int64_t n_head_kv = hparams.n_head_kv(i);
|
||||||
|
|
||||||
|
if (n_head_kv == 0 && n_head > 0) {
|
||||||
|
// linear attention for DeciLMCausalModel
|
||||||
|
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||||
|
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||||
|
}
|
||||||
|
else if (n_head_kv > 0) {
|
||||||
|
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||||
|
|
||||||
|
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||||
|
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||||
|
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
||||||
|
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// optional bias tensors
|
||||||
|
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||||
|
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||||
|
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||||
|
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||||
|
|
||||||
|
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||||
|
|
||||||
|
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
|
||||||
|
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||||
|
layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||||
|
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||||
|
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||||
|
|
||||||
|
// optional MLP bias
|
||||||
|
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||||
|
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||||
|
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_MINICPM3:
|
case LLM_ARCH_MINICPM3:
|
||||||
{
|
{
|
||||||
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
||||||
@ -11305,6 +11412,167 @@ struct llm_build_context {
|
|||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph * build_deci() {
|
||||||
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||||
|
|
||||||
|
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||||
|
int32_t n_tokens = this->n_tokens;
|
||||||
|
|
||||||
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||||
|
|
||||||
|
struct ggml_tensor * cur;
|
||||||
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
||||||
|
|
||||||
|
// inp_pos - contains the positions
|
||||||
|
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||||
|
|
||||||
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||||
|
|
||||||
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||||
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
struct ggml_tensor * inpSA = inpL;
|
||||||
|
const int64_t n_head_kv = hparams.n_head_kv(il);
|
||||||
|
const int64_t n_head = hparams.n_head(il);
|
||||||
|
|
||||||
|
if (n_head == 0) {
|
||||||
|
// attention-free layer of Llama-3_1-Nemotron-51B
|
||||||
|
cur = inpL;
|
||||||
|
} else {
|
||||||
|
// norm
|
||||||
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||||
|
model.layers[il].attn_norm, NULL,
|
||||||
|
LLM_NORM_RMS, cb, il);
|
||||||
|
cb(cur, "attn_norm", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n_head > 0 && n_head_kv == 0) {
|
||||||
|
// "linear attention" of Llama-3_1-Nemotron-51B
|
||||||
|
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
|
||||||
|
cb(cur, "wo", il);
|
||||||
|
} else if (n_head > 0) {
|
||||||
|
// self-attention
|
||||||
|
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||||
|
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
||||||
|
|
||||||
|
// compute Q and K and RoPE them
|
||||||
|
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
if (model.layers[il].bq) {
|
||||||
|
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
if (model.layers[il].bk) {
|
||||||
|
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
if (model.layers[il].bv) {
|
||||||
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
Qcur = ggml_rope_ext(
|
||||||
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
|
||||||
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
|
);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
|
Kcur = ggml_rope_ext(
|
||||||
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
|
||||||
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
|
);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
|
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
||||||
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (il == n_layer - 1) {
|
||||||
|
// skip computing output for unused tokens
|
||||||
|
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||||
|
n_tokens = n_outputs;
|
||||||
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||||
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||||
|
}
|
||||||
|
|
||||||
|
// For Granite architecture
|
||||||
|
if (hparams.f_residual_scale) {
|
||||||
|
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
// modified to support attention-free layer of Llama-3_1-Nemotron-51B
|
||||||
|
struct ggml_tensor * ffn_inp = cur;
|
||||||
|
if (n_head > 0) {
|
||||||
|
ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||||
|
cb(ffn_inp, "ffn_inp", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
// feed-forward network
|
||||||
|
if (model.layers[il].ffn_gate_inp == nullptr) {
|
||||||
|
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||||
|
model.layers[il].ffn_norm, NULL,
|
||||||
|
LLM_NORM_RMS, cb, il);
|
||||||
|
cb(cur, "ffn_norm", il);
|
||||||
|
|
||||||
|
cur = llm_build_ffn(ctx0, lctx, cur,
|
||||||
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||||
|
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
||||||
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||||
|
NULL,
|
||||||
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||||
|
cb(cur, "ffn_out", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
// For Granite architecture
|
||||||
|
if (hparams.f_residual_scale) {
|
||||||
|
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||||
|
cb(cur, "ffn_out", il);
|
||||||
|
|
||||||
|
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||||
|
cb(cur, "l_out", il);
|
||||||
|
|
||||||
|
// input for next layer
|
||||||
|
inpL = cur;
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = inpL;
|
||||||
|
|
||||||
|
cur = llm_build_norm(ctx0, cur, hparams,
|
||||||
|
model.output_norm, NULL,
|
||||||
|
LLM_NORM_RMS, cb, -1);
|
||||||
|
cb(cur, "result_norm", -1);
|
||||||
|
|
||||||
|
// lm_head
|
||||||
|
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||||
|
|
||||||
|
// For Granite architecture
|
||||||
|
if (hparams.f_logit_scale) {
|
||||||
|
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
cb(cur, "result_output", -1);
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_baichuan() {
|
struct ggml_cgraph * build_baichuan() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||||
|
|
||||||
@ -17419,6 +17687,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
{
|
{
|
||||||
result = llm.build_llama();
|
result = llm.build_llama();
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_DECI:
|
||||||
|
{
|
||||||
|
result = llm.build_deci();
|
||||||
|
} break;
|
||||||
case LLM_ARCH_BAICHUAN:
|
case LLM_ARCH_BAICHUAN:
|
||||||
{
|
{
|
||||||
result = llm.build_baichuan();
|
result = llm.build_baichuan();
|
||||||
@ -20794,6 +21066,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|||||||
|
|
||||||
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
||||||
case LLM_ARCH_LLAMA:
|
case LLM_ARCH_LLAMA:
|
||||||
|
case LLM_ARCH_DECI:
|
||||||
case LLM_ARCH_BAICHUAN:
|
case LLM_ARCH_BAICHUAN:
|
||||||
case LLM_ARCH_STARCODER:
|
case LLM_ARCH_STARCODER:
|
||||||
case LLM_ARCH_PLAMO:
|
case LLM_ARCH_PLAMO:
|
||||||
@ -22615,6 +22888,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
|
|||||||
}
|
}
|
||||||
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
|
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
|
||||||
return LLM_CHAT_TEMPLATE_PHI_3;
|
return LLM_CHAT_TEMPLATE_PHI_3;
|
||||||
|
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
|
||||||
|
return LLM_CHAT_TEMPLATE_FALCON_3;
|
||||||
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
|
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
|
||||||
return LLM_CHAT_TEMPLATE_ZEPHYR;
|
return LLM_CHAT_TEMPLATE_ZEPHYR;
|
||||||
} else if (tmpl_contains("bos_token + message['role']")) {
|
} else if (tmpl_contains("bos_token + message['role']")) {
|
||||||
@ -22661,6 +22936,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
|
|||||||
return LLM_CHAT_TEMPLATE_GRANITE;
|
return LLM_CHAT_TEMPLATE_GRANITE;
|
||||||
} else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
|
} else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
|
||||||
return LLM_CHAT_TEMPLATE_GIGACHAT;
|
return LLM_CHAT_TEMPLATE_GIGACHAT;
|
||||||
|
} else if (tmpl_contains("<|role_start|>")) {
|
||||||
|
return LLM_CHAT_TEMPLATE_MEGREZ;
|
||||||
}
|
}
|
||||||
return LLM_CHAT_TEMPLATE_UNKNOWN;
|
return LLM_CHAT_TEMPLATE_UNKNOWN;
|
||||||
}
|
}
|
||||||
@ -22767,6 +23044,15 @@ static int32_t llama_chat_apply_template_internal(
|
|||||||
if (add_ass) {
|
if (add_ass) {
|
||||||
ss << "<|assistant|>\n";
|
ss << "<|assistant|>\n";
|
||||||
}
|
}
|
||||||
|
} else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
|
||||||
|
// Falcon 3
|
||||||
|
for (auto message : chat) {
|
||||||
|
std::string role(message->role);
|
||||||
|
ss << "<|" << role << "|>\n" << message->content << "\n";
|
||||||
|
}
|
||||||
|
if (add_ass) {
|
||||||
|
ss << "<|assistant|>\n";
|
||||||
|
}
|
||||||
} else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
|
} else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
|
||||||
// zephyr template
|
// zephyr template
|
||||||
for (auto message : chat) {
|
for (auto message : chat) {
|
||||||
@ -23010,6 +23296,16 @@ static int32_t llama_chat_apply_template_internal(
|
|||||||
if (add_ass) {
|
if (add_ass) {
|
||||||
ss << "assistant<|role_sep|>";
|
ss << "assistant<|role_sep|>";
|
||||||
}
|
}
|
||||||
|
} else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
|
||||||
|
// Megrez template
|
||||||
|
for (auto message : chat) {
|
||||||
|
std::string role(message->role);
|
||||||
|
ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
|
||||||
|
}
|
||||||
|
|
||||||
|
if (add_ass) {
|
||||||
|
ss << "<|role_start|>assistant<|role_end|>";
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// template not supported
|
// template not supported
|
||||||
return -1;
|
return -1;
|
||||||
|
@ -3945,6 +3945,18 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (int K : {3, 5}) {
|
||||||
|
for (int IC : {256, 2560}) {
|
||||||
|
for (int IW_IH : {32, 64, 256}) {
|
||||||
|
if (IC == 2560 && IW_IH == 256) {
|
||||||
|
// too big
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {IW_IH, IW_IH, IC, 1}, {K, K, IC, 1}, 1, 1, 1, 1, 1, 1, true));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return test_cases;
|
return test_cases;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -77,6 +77,8 @@ int main(void) {
|
|||||||
"{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}",
|
"{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}",
|
||||||
// ai-sage/GigaChat-20B-A3B-instruct
|
// ai-sage/GigaChat-20B-A3B-instruct
|
||||||
"{% if messages[0]['role'] == 'system' -%}\n {%- set loop_messages = messages[1:] -%}\n {%- set system_message = bos_token + messages[0]['content'] + additional_special_tokens[1] -%}\n{%- else -%}\n {%- set loop_messages = messages -%}\n {%- set system_message = bos_token + '' -%}\n{%- endif -%}\n{%- for message in loop_messages %}\n {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n {% endif %}\n \n {%- if loop.index0 == 0 -%}\n {{ system_message -}}\n {%- endif -%}\n {%- if message['role'] == 'user' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {{ 'available functions' + additional_special_tokens[0] + additional_special_tokens[2] + additional_special_tokens[3] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if message['role'] == 'assistant' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if loop.last and add_generation_prompt -%}\n {{ 'assistant' + additional_special_tokens[0] -}}\n {%- endif -%}\n{%- endfor %}",
|
"{% if messages[0]['role'] == 'system' -%}\n {%- set loop_messages = messages[1:] -%}\n {%- set system_message = bos_token + messages[0]['content'] + additional_special_tokens[1] -%}\n{%- else -%}\n {%- set loop_messages = messages -%}\n {%- set system_message = bos_token + '' -%}\n{%- endif -%}\n{%- for message in loop_messages %}\n {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n {% endif %}\n \n {%- if loop.index0 == 0 -%}\n {{ system_message -}}\n {%- endif -%}\n {%- if message['role'] == 'user' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {{ 'available functions' + additional_special_tokens[0] + additional_special_tokens[2] + additional_special_tokens[3] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if message['role'] == 'assistant' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if loop.last and add_generation_prompt -%}\n {{ 'assistant' + additional_special_tokens[0] -}}\n {%- endif -%}\n{%- endfor %}",
|
||||||
|
// Infinigence/Megrez-3B-Instruct
|
||||||
|
u8"{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|role_start|>system<|role_end|>你是Megrez-3B-Instruct,将针对用户的问题给出详细的、积极的回答。<|turn_end|>' }}{% endif %}{{ '<|role_start|>' + message['role'] + '<|role_end|>' + message['content'] + '<|turn_end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|role_start|>assistant<|role_end|>' }}{% endif %}"
|
||||||
};
|
};
|
||||||
std::vector<std::string> expected_output = {
|
std::vector<std::string> expected_output = {
|
||||||
// teknium/OpenHermes-2.5-Mistral-7B
|
// teknium/OpenHermes-2.5-Mistral-7B
|
||||||
@ -133,6 +135,8 @@ int main(void) {
|
|||||||
"[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT][INST] Hello[/INST] Hi there</s>[INST] Who are you[/INST] I am an assistant </s>[INST] Another question[/INST]",
|
"[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT][INST] Hello[/INST] Hi there</s>[INST] Who are you[/INST] I am an assistant </s>[INST] Another question[/INST]",
|
||||||
// ai-sage/GigaChat-20B-A3B-instruct
|
// ai-sage/GigaChat-20B-A3B-instruct
|
||||||
"<s>You are a helpful assistant<|message_sep|>user<|role_sep|>Hello<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>Hi there<|message_sep|>user<|role_sep|>Who are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|> I am an assistant <|message_sep|>user<|role_sep|>Another question<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>",
|
"<s>You are a helpful assistant<|message_sep|>user<|role_sep|>Hello<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>Hi there<|message_sep|>user<|role_sep|>Who are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|> I am an assistant <|message_sep|>user<|role_sep|>Another question<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>",
|
||||||
|
// Infinigence/Megrez-3B-Instruct
|
||||||
|
"<|role_start|>system<|role_end|>You are a helpful assistant<|turn_end|><|role_start|>user<|role_end|>Hello<|turn_end|><|role_start|>assistant<|role_end|>Hi there<|turn_end|><|role_start|>user<|role_end|>Who are you<|turn_end|><|role_start|>assistant<|role_end|> I am an assistant <|turn_end|><|role_start|>user<|role_end|>Another question<|turn_end|><|role_start|>assistant<|role_end|>",
|
||||||
};
|
};
|
||||||
std::vector<char> formatted_chat(1024);
|
std::vector<char> formatted_chat(1024);
|
||||||
int32_t res;
|
int32_t res;
|
||||||
|
Loading…
Reference in New Issue
Block a user