mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-24 18:39:19 +01:00
Introducing experimental OpenCL backend with support for Qualcomm Adreno GPUs (#10693)
* [cl][adreno] Add Adreno GPU support Add new OpenCL backend to support Adreno GPUs --------- Co-authored-by: Skyler Szot <quic_sszot@quicinc.com> Co-authored-by: Shangqing Gu <quic_shawngu@quicinc.com> Co-authored-by: Alexander Angus <quic_aangus@quicinc.com> Co-authored-by: Hongqiang Wang <quic_wangh@quicinc.com> Co-authored-by: Max Krasnyansky <quic_maxk@quicinc.com> * [cl][ci] Add workflow for CL * [cl][adreno] Fix memory leak for non SMALL_ALLOC path * opencl: integrate backend dyn.load interface and fix compiler and format warnings * opencl: remove small-alloc support and fix build errors for non-opencl platforms * opencl: fixed merge conflict (MUSA added twice in cmake) * opencl-ci: use RUNNER_TEMP instead of github.workspace * opencl: fix embed tool invocation with python3 * opencl: CI workflow fixes * opencl: Clean up small-alloc in CMake files * opencl: cleanup ggml-opencl2 header file * opencl: use ulong for offsets and strides in ADD kernel * opencl: use cl_ulong for all offsets * opencl: use cl_ulong for sizes and strides * opencl: use `GGML_LOG_xxx` instead of `fprintf(stderr, ...)` * opencl: rename backend `opencl2` -> `opencl` * opencl: rename kernel files `ggml-opencl2` -> `ggml-opencl` * opencl: make OpenCL required, remove redundant lib and inc directories * `ggml-base`, `..` and `.` are added by `ggml_add_backend_library` * opencl: rename backend - funcs, structs, etc `opencl2` -> `opencl` * opencl: remove copyright marker since main license already covers * opencl: replace some more OPENCL2 leftovers * opencl: remove limits on `tensor_extra` * opencl: use pools for `tensor_extra` * opencl: fix compiler warnings with GCC and Clang Still getting the warning about clCreateCmdQueue being obsolete. Will fix that separately. * opencl: fail gracefully if opencl devices are not available Also for unsupported GPUs. * opencl: fix MSVC builds (string length error) * opencl: check for various requirements, allow deprecated API * opencl: update log message for unsupported GPUs --------- Co-authored-by: Skyler Szot <quic_sszot@quicinc.com> Co-authored-by: Shangqing Gu <quic_shawngu@quicinc.com> Co-authored-by: Alexander Angus <quic_aangus@quicinc.com> Co-authored-by: Hongqiang Wang <quic_wangh@quicinc.com> Co-authored-by: Max Krasnyansky <quic_maxk@quicinc.com>
This commit is contained in:
parent
c27ac678dd
commit
a76c56fa1a
26
.github/workflows/build.yml
vendored
26
.github/workflows/build.yml
vendored
@ -662,6 +662,8 @@ jobs:
|
|||||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'msvc-arm64'
|
- build: 'msvc-arm64'
|
||||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
|
- build: 'llvm-arm64-opencl-adreno'
|
||||||
|
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
@ -703,6 +705,28 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
choco install ninja
|
choco install ninja
|
||||||
|
|
||||||
|
- name: Install OpenCL Headers and Libs
|
||||||
|
id: install_opencl
|
||||||
|
if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
|
||||||
|
run: |
|
||||||
|
git clone https://github.com/KhronosGroup/OpenCL-Headers
|
||||||
|
cd OpenCL-Headers
|
||||||
|
mkdir build && cd build
|
||||||
|
cmake .. `
|
||||||
|
-DBUILD_TESTING=OFF `
|
||||||
|
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
|
||||||
|
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
|
||||||
|
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||||
|
cmake --build . --target install
|
||||||
|
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
|
||||||
|
cd OpenCL-ICD-Loader
|
||||||
|
mkdir build-arm64-release && cd build-arm64-release
|
||||||
|
cmake .. `
|
||||||
|
-A arm64 `
|
||||||
|
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
|
||||||
|
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||||
|
cmake --build . --target install --config release
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
@ -732,7 +756,7 @@ jobs:
|
|||||||
- name: Test
|
- name: Test
|
||||||
id: cmake_test
|
id: cmake_test
|
||||||
# not all machines have native AVX-512
|
# not all machines have native AVX-512
|
||||||
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
|
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
|
||||||
run: |
|
run: |
|
||||||
cd build
|
cd build
|
||||||
ctest -L main -C Release --verbose --timeout 900
|
ctest -L main -C Release --verbose --timeout 900
|
||||||
|
@ -179,6 +179,11 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
|||||||
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
|
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
|
||||||
"ggml: sycl device architecture")
|
"ggml: sycl device architecture")
|
||||||
|
|
||||||
|
option(GGML_OPENCL "ggml: use OpenCL" OFF)
|
||||||
|
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
|
||||||
|
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
|
||||||
|
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
|
||||||
|
|
||||||
# extra artifacts
|
# extra artifacts
|
||||||
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
|
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
|
||||||
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
|
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
|
||||||
|
26
ggml/include/ggml-opencl.h
Normal file
26
ggml/include/ggml-opencl.h
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
#ifndef GGML_OPENCL_H
|
||||||
|
#define GGML_OPENCL_H
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//
|
||||||
|
// backend API
|
||||||
|
//
|
||||||
|
GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void);
|
||||||
|
GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend);
|
||||||
|
|
||||||
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
|
||||||
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
|
||||||
|
|
||||||
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif // GGML_OPENCL_H
|
@ -308,6 +308,7 @@ ggml_add_backend(MUSA)
|
|||||||
ggml_add_backend(RPC)
|
ggml_add_backend(RPC)
|
||||||
ggml_add_backend(SYCL)
|
ggml_add_backend(SYCL)
|
||||||
ggml_add_backend(Vulkan)
|
ggml_add_backend(Vulkan)
|
||||||
|
ggml_add_backend(OpenCL)
|
||||||
|
|
||||||
foreach (target ggml-base ggml)
|
foreach (target ggml-base ggml)
|
||||||
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
|
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
|
||||||
|
@ -46,6 +46,10 @@
|
|||||||
#include "ggml-vulkan.h"
|
#include "ggml-vulkan.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_OPENCL
|
||||||
|
#include "ggml-opencl.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_BLAS
|
#ifdef GGML_USE_BLAS
|
||||||
#include "ggml-blas.h"
|
#include "ggml-blas.h"
|
||||||
#endif
|
#endif
|
||||||
@ -146,6 +150,9 @@ struct ggml_backend_registry {
|
|||||||
#ifdef GGML_USE_VULKAN
|
#ifdef GGML_USE_VULKAN
|
||||||
register_backend(ggml_backend_vk_reg());
|
register_backend(ggml_backend_vk_reg());
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef GGML_USE_OPENCL
|
||||||
|
register_backend(ggml_backend_opencl_reg());
|
||||||
|
#endif
|
||||||
#ifdef GGML_USE_CANN
|
#ifdef GGML_USE_CANN
|
||||||
register_backend(ggml_backend_cann_reg());
|
register_backend(ggml_backend_cann_reg());
|
||||||
#endif
|
#endif
|
||||||
@ -539,6 +546,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
|
|||||||
ggml_backend_load_best("rpc", silent, dir_path);
|
ggml_backend_load_best("rpc", silent, dir_path);
|
||||||
ggml_backend_load_best("sycl", silent, dir_path);
|
ggml_backend_load_best("sycl", silent, dir_path);
|
||||||
ggml_backend_load_best("vulkan", silent, dir_path);
|
ggml_backend_load_best("vulkan", silent, dir_path);
|
||||||
|
ggml_backend_load_best("opencl", silent, dir_path);
|
||||||
ggml_backend_load_best("musa", silent, dir_path);
|
ggml_backend_load_best("musa", silent, dir_path);
|
||||||
ggml_backend_load_best("cpu", silent, dir_path);
|
ggml_backend_load_best("cpu", silent, dir_path);
|
||||||
}
|
}
|
||||||
|
147
ggml/src/ggml-opencl/CMakeLists.txt
Normal file
147
ggml/src/ggml-opencl/CMakeLists.txt
Normal file
@ -0,0 +1,147 @@
|
|||||||
|
find_package(OpenCL REQUIRED)
|
||||||
|
find_package(Python3 REQUIRED)
|
||||||
|
|
||||||
|
set(TARGET_NAME ggml-opencl)
|
||||||
|
|
||||||
|
ggml_add_backend_library(${TARGET_NAME}
|
||||||
|
ggml-opencl.cpp
|
||||||
|
../../include/ggml-opencl.h)
|
||||||
|
target_link_libraries(${TARGET_NAME} PRIVATE ${OpenCL_LIBRARIES})
|
||||||
|
target_include_directories(${TARGET_NAME} PRIVATE ${OpenCL_INCLUDE_DIRS})
|
||||||
|
|
||||||
|
if (GGML_OPENCL_PROFILING)
|
||||||
|
message(STATUS "OpenCL profiling enabled (increases CPU overhead)")
|
||||||
|
add_compile_definitions(GGML_OPENCL_PROFILING)
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
add_compile_definitions(GGML_OPENCL_SOA_Q)
|
||||||
|
|
||||||
|
if (GGML_OPENCL_USE_ADRENO_KERNELS)
|
||||||
|
message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
|
||||||
|
add_compile_definitions(GGML_OPENCL_USE_ADRENO_KERNELS)
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
if (GGML_OPENCL_EMBED_KERNELS)
|
||||||
|
add_compile_definitions(GGML_OPENCL_EMBED_KERNELS)
|
||||||
|
|
||||||
|
set(OPENCL_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl.cl.h")
|
||||||
|
set(OPENCL_MM_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mm.cl.h")
|
||||||
|
set(OPENCL_CVT_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_cvt.cl.h")
|
||||||
|
|
||||||
|
set(OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle.cl.h")
|
||||||
|
set(OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle_general.cl.h")
|
||||||
|
set(OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h")
|
||||||
|
set(OPENCL_TRANSPOSE_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_16.cl.h")
|
||||||
|
set(OPENCL_TRANSPOSE_32_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32.cl.h")
|
||||||
|
set(OPENCL_TRANSPOSE_32_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32_16.cl.h")
|
||||||
|
|
||||||
|
set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
|
||||||
|
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
|
||||||
|
|
||||||
|
include_directories("${CMAKE_BINARY_DIR}/autogenerated")
|
||||||
|
|
||||||
|
# Python must be accessible from command line
|
||||||
|
add_custom_command(
|
||||||
|
OUTPUT ${OPENCL_CL_SOURCE_EMBED}
|
||||||
|
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl.cl
|
||||||
|
${OPENCL_CL_SOURCE_EMBED}
|
||||||
|
DEPENDS kernels/ggml-opencl.cl ${EMBED_KERNEL_SCRIPT}
|
||||||
|
COMMENT "Generate ggml-opencl.cl.h"
|
||||||
|
)
|
||||||
|
|
||||||
|
add_custom_command(
|
||||||
|
OUTPUT ${OPENCL_MM_CL_SOURCE_EMBED}
|
||||||
|
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mm.cl
|
||||||
|
${OPENCL_MM_CL_SOURCE_EMBED}
|
||||||
|
DEPENDS kernels/ggml-opencl_mm.cl ${EMBED_KERNEL_SCRIPT}
|
||||||
|
COMMENT "Generate ggml-opencl_mm.cl.h"
|
||||||
|
)
|
||||||
|
|
||||||
|
add_custom_command(
|
||||||
|
OUTPUT ${OPENCL_CVT_CL_SOURCE_EMBED}
|
||||||
|
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_cvt.cl
|
||||||
|
${OPENCL_CVT_CL_SOURCE_EMBED}
|
||||||
|
DEPENDS kernels/ggml-opencl_cvt.cl ${EMBED_KERNEL_SCRIPT}
|
||||||
|
COMMENT "Generate ggml-opencl_cvt.cl.h"
|
||||||
|
)
|
||||||
|
|
||||||
|
add_custom_command(
|
||||||
|
OUTPUT ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
|
||||||
|
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle.cl
|
||||||
|
${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
|
||||||
|
DEPENDS kernels/ggml-opencl_gemv_noshuffle.cl ${EMBED_KERNEL_SCRIPT}
|
||||||
|
COMMENT "Generate ggml-opencl_gemv_noshuffle.cl.h"
|
||||||
|
)
|
||||||
|
|
||||||
|
add_custom_command(
|
||||||
|
OUTPUT ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
|
||||||
|
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle_general.cl
|
||||||
|
${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
|
||||||
|
DEPENDS kernels/ggml-opencl_gemv_noshuffle_general.cl ${EMBED_KERNEL_SCRIPT}
|
||||||
|
COMMENT "Generate ggml-opencl_gemv_noshuffle_general.cl.h"
|
||||||
|
)
|
||||||
|
|
||||||
|
add_custom_command(
|
||||||
|
OUTPUT ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
|
||||||
|
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
|
||||||
|
${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
|
||||||
|
DEPENDS kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${EMBED_KERNEL_SCRIPT}
|
||||||
|
COMMENT "Generate ggml-opencl_mul_mat_Ab_Bi_8x4.cl.cl.h"
|
||||||
|
)
|
||||||
|
|
||||||
|
add_custom_command(
|
||||||
|
OUTPUT ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
|
||||||
|
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_16.cl
|
||||||
|
${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
|
||||||
|
DEPENDS kernels/ggml-opencl_transpose_16.cl ${EMBED_KERNEL_SCRIPT}
|
||||||
|
COMMENT "Generate ggml-opencl_transpose_16.cl.h"
|
||||||
|
)
|
||||||
|
|
||||||
|
add_custom_command(
|
||||||
|
OUTPUT ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
|
||||||
|
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32.cl
|
||||||
|
${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
|
||||||
|
DEPENDS kernels/ggml-opencl_transpose_32.cl ${EMBED_KERNEL_SCRIPT}
|
||||||
|
COMMENT "Generate ggml-opencl_transpose_32.cl.h"
|
||||||
|
)
|
||||||
|
|
||||||
|
add_custom_command(
|
||||||
|
OUTPUT ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
|
||||||
|
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32_16.cl
|
||||||
|
${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
|
||||||
|
DEPENDS kernels/ggml-opencl_transpose_32_16.cl ${EMBED_KERNEL_SCRIPT}
|
||||||
|
COMMENT "Generate ggml-opencl_transpose_32_16.cl.h"
|
||||||
|
)
|
||||||
|
|
||||||
|
target_sources(${TARGET_NAME} PRIVATE
|
||||||
|
${OPENCL_CL_SOURCE_EMBED}
|
||||||
|
${OPENCL_MM_CL_SOURCE_EMBED}
|
||||||
|
${OPENCL_CVT_CL_SOURCE_EMBED}
|
||||||
|
${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
|
||||||
|
${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
|
||||||
|
${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
|
||||||
|
${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
|
||||||
|
${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
|
||||||
|
${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED})
|
||||||
|
else ()
|
||||||
|
# copy ggml-opencl.cl to bin directory
|
||||||
|
configure_file(kernels/ggml-opencl.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl.cl COPYONLY)
|
||||||
|
configure_file(kernels/ggml-opencl_mm.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mm.cl COPYONLY)
|
||||||
|
configure_file(kernels/ggml-opencl_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_cvt.cl COPYONLY)
|
||||||
|
|
||||||
|
configure_file(kernels/ggml-opencl_gemv_noshuffle.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle.cl COPYONLY)
|
||||||
|
configure_file(kernels/ggml-opencl_gemv_noshuffle_general.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle_general.cl COPYONLY)
|
||||||
|
configure_file(kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mul_mat_Ab_Bi_8x4.cl COPYONLY)
|
||||||
|
configure_file(kernels/ggml-opencl_transpose_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_16.cl COPYONLY)
|
||||||
|
configure_file(kernels/ggml-opencl_transpose_32.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32.cl COPYONLY)
|
||||||
|
configure_file(kernels/ggml-opencl_transpose_32_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32_16.cl COPYONLY)
|
||||||
|
endif ()
|
4004
ggml/src/ggml-opencl/ggml-opencl.cpp
Normal file
4004
ggml/src/ggml-opencl/ggml-opencl.cpp
Normal file
File diff suppressed because it is too large
Load Diff
26
ggml/src/ggml-opencl/kernels/embed_kernel.py
Normal file
26
ggml/src/ggml-opencl/kernels/embed_kernel.py
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
#
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import logging
|
||||||
|
logger = logging.getLogger("opencl-embed-kernel")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
if len(sys.argv) != 3:
|
||||||
|
logger.info("Usage: python embed_kernel.py <input_file> <output_file>")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
ifile = open(sys.argv[1], "r")
|
||||||
|
ofile = open(sys.argv[2], "w")
|
||||||
|
|
||||||
|
for i in ifile:
|
||||||
|
ofile.write('R"({})"\n'.format(i))
|
||||||
|
|
||||||
|
ifile.close()
|
||||||
|
ofile.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
2683
ggml/src/ggml-opencl/kernels/ggml-opencl.cl
Normal file
2683
ggml/src/ggml-opencl/kernels/ggml-opencl.cl
Normal file
File diff suppressed because it is too large
Load Diff
106
ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl
Normal file
106
ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
//------------------------------------------------------------------------------
|
||||||
|
// This file is contains additional kernels for data conversion.
|
||||||
|
// These kernels are used when loading the model, so its performance is less
|
||||||
|
// important.
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
#ifdef cl_khr_fp16
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
|
#elif defined(cl_amd_fp16)
|
||||||
|
#pragma OPENCL EXTENSION cl_amd_fp16 : enable
|
||||||
|
#else
|
||||||
|
#error "Half precision floating point not supportedby OpenCL implementation on your device."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef cl_khr_subgroups
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||||
|
#elif defined(cl_intel_subgroups)
|
||||||
|
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||||
|
#else
|
||||||
|
#error "Subgroup not supported on your device."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef cl_intel_required_subgroup_size
|
||||||
|
// Always use subgroup size of 32 on Intel.
|
||||||
|
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||||
|
#define INTEL_GPU 1
|
||||||
|
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||||
|
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||||
|
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||||
|
// Always use subgroups size of 64 on Adreno.
|
||||||
|
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||||
|
#define ADRENO_GPU 1
|
||||||
|
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||||
|
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||||
|
#else
|
||||||
|
// TODO: do not know how to choose subgroup size on other GPUs.
|
||||||
|
#error "Selecting subgroup size is not supported on your device."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define QK4_0 32
|
||||||
|
#define QR4_0 2
|
||||||
|
#define QK4_1 32
|
||||||
|
#define QR4_1 2
|
||||||
|
#define QK5_0 32
|
||||||
|
#define QR5_0 2
|
||||||
|
#define QK5_1 32
|
||||||
|
#define QR5_1 2
|
||||||
|
#define QK8_0 32
|
||||||
|
#define QR8_0 1
|
||||||
|
#define QK_K 256
|
||||||
|
#define K_QUANTS_PER_ITERATION 2
|
||||||
|
|
||||||
|
typedef char int8_t;
|
||||||
|
typedef uchar uint8_t;
|
||||||
|
typedef short int16_t;
|
||||||
|
typedef ushort uint16_t;
|
||||||
|
typedef int int32_t;
|
||||||
|
typedef uint uint32_t;
|
||||||
|
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
// block_q4_0
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
struct block_q4_0
|
||||||
|
{
|
||||||
|
half d;
|
||||||
|
uint8_t qs[QK4_0 / 2];
|
||||||
|
};
|
||||||
|
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
// mul_vec_q_n_f32_flat_noshuffle
|
||||||
|
//
|
||||||
|
// This variation uses flat arrays (struct of arrays, SOA) representation for
|
||||||
|
// quant tensors. It also uses non shuffled bit order for weights.
|
||||||
|
//
|
||||||
|
// The shuffled version is kept in the original file because moving it here
|
||||||
|
// seems to result in worse performance for adreno.
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
kernel void kernel_convert_block_q4_0_noshuffle(
|
||||||
|
global struct block_q4_0 * src0,
|
||||||
|
global uchar * dst_q,
|
||||||
|
global half * dst_d
|
||||||
|
) {
|
||||||
|
global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
|
||||||
|
global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
|
||||||
|
global half * d = (global half *) dst_d + get_global_id(0);
|
||||||
|
|
||||||
|
*d = b->d;
|
||||||
|
for (int i = 0; i < QK4_0/4; ++i) {
|
||||||
|
uchar x0 = b->qs[2*i + 0];
|
||||||
|
uchar x1 = b->qs[2*i + 1];
|
||||||
|
|
||||||
|
q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
||||||
|
q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
||||||
|
|
||||||
|
#ifdef ADRENO_GPU
|
||||||
|
// Workaround for adreno - must have the following printf statement for
|
||||||
|
// the kernel to work properly. Otherwise it produces incorrect result.
|
||||||
|
// convert_uchar above also seems necessary.
|
||||||
|
// Compare against a large number so that it does not print anything.
|
||||||
|
// get_sub_group_local_id() also works.
|
||||||
|
if (get_global_id(0) == 65536*4096) {
|
||||||
|
printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
265
ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
Normal file
265
ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
Normal file
@ -0,0 +1,265 @@
|
|||||||
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||||
|
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
|
||||||
|
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
|
||||||
|
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
|
||||||
|
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||||
|
|
||||||
|
// assume
|
||||||
|
#define QK4_0 32
|
||||||
|
#define N_SIMDGROUP 4
|
||||||
|
|
||||||
|
#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
|
||||||
|
float shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s0, 0); \
|
||||||
|
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s1, 0); \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s2, 0); \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s3, 0); \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s4, 0); \
|
||||||
|
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s5, 0); \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s6, 0); \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s7, 0); \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s0, 1); \
|
||||||
|
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s1, 1); \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s2, 1); \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s3, 1); \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s4, 1); \
|
||||||
|
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s5, 1); \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s6, 1); \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s7, 1); \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
|
||||||
|
|
||||||
|
#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
|
||||||
|
shared_y = sub_group_broadcast(y.s0, 2); \
|
||||||
|
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s1, 2); \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s2, 2); \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s3, 2); \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s4, 2); \
|
||||||
|
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s5, 2); \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s6, 2); \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s7, 2); \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s0, 3); \
|
||||||
|
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s1, 3); \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s2, 3); \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s3, 3); \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s4, 3); \
|
||||||
|
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s5, 3); \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s6, 3); \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s7, 3); \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
|
||||||
|
|
||||||
|
#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
|
||||||
|
float8 shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y, 0); \
|
||||||
|
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
|
||||||
|
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
|
||||||
|
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
|
||||||
|
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
|
||||||
|
shared_y = sub_group_broadcast(y, 1); \
|
||||||
|
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
|
||||||
|
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
|
||||||
|
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
|
||||||
|
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
|
||||||
|
|
||||||
|
|
||||||
|
#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
|
||||||
|
shared_y = sub_group_broadcast(y, 2); \
|
||||||
|
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
|
||||||
|
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
|
||||||
|
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
|
||||||
|
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
|
||||||
|
shared_y = sub_group_broadcast(y, 3); \
|
||||||
|
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
|
||||||
|
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
|
||||||
|
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
|
||||||
|
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
|
||||||
|
|
||||||
|
|
||||||
|
__attribute__((qcom_reqd_sub_group_size("full")))
|
||||||
|
__kernel void kernel_gemv_noshuffle(
|
||||||
|
__read_only image1d_buffer_t src0_q, // quantized A
|
||||||
|
global half2 * src0_d, // A scales
|
||||||
|
__read_only image1d_buffer_t src1, // B
|
||||||
|
ulong offset1, // offset to B (0)
|
||||||
|
global float * dst, // C
|
||||||
|
ulong offsetd, // offset to C (0)
|
||||||
|
uint K, // K
|
||||||
|
int ne01, // M
|
||||||
|
int ne02, // 1
|
||||||
|
int ne10, // K
|
||||||
|
int ne12, // 1
|
||||||
|
int ne0, // M
|
||||||
|
int ne1, // N
|
||||||
|
int r2, // 1
|
||||||
|
int r3)
|
||||||
|
{
|
||||||
|
uint groupId = get_local_id(1);
|
||||||
|
uint gid = get_global_id(0);
|
||||||
|
ushort slid = get_sub_group_local_id();
|
||||||
|
|
||||||
|
__private uint4 regA;
|
||||||
|
__private half2 regS;
|
||||||
|
__private float8 regB;
|
||||||
|
|
||||||
|
__private float2 totalSum = (float2)(0.0f);
|
||||||
|
|
||||||
|
// loop along K in block granularity, skip 4 blocks every iter
|
||||||
|
for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
|
||||||
|
regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
|
||||||
|
// first 4 fibers in each wave load 8 B values to its private scope
|
||||||
|
if (slid < 4) {
|
||||||
|
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
|
||||||
|
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
|
||||||
|
}
|
||||||
|
|
||||||
|
// load half weights for two blocks in consecutive rows
|
||||||
|
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
|
||||||
|
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
|
||||||
|
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
|
||||||
|
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
|
||||||
|
#ifdef VECTOR_SUB_GROUP_BROADCAT
|
||||||
|
dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
|
||||||
|
#else
|
||||||
|
dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
|
||||||
|
#endif // VECTOR_SUB_GROUP_BROADCAT
|
||||||
|
|
||||||
|
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
|
||||||
|
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
|
||||||
|
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
|
||||||
|
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
|
||||||
|
#ifdef VECTOR_SUB_GROUP_BROADCAT
|
||||||
|
dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
|
||||||
|
#else
|
||||||
|
dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
|
||||||
|
#endif // VECTOR_SUB_GROUP_BROADCAT
|
||||||
|
}
|
||||||
|
|
||||||
|
// reduction in local memory, assumes #wave=4
|
||||||
|
__local float2 reduceLM[SIMDGROUP_WIDTH * 3];
|
||||||
|
if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
|
||||||
|
if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
|
||||||
|
if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
|
||||||
|
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
|
||||||
|
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
|
||||||
|
|
||||||
|
// 2 outputs per fiber in wave 0
|
||||||
|
if (groupId == 0) {
|
||||||
|
dst = (global float*)((global char*)dst + offsetd);
|
||||||
|
vstore2(totalSum, 0, &(dst[gid * 2]));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,271 @@
|
|||||||
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||||
|
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
|
||||||
|
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
|
||||||
|
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
|
||||||
|
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||||
|
|
||||||
|
// assume
|
||||||
|
#define QK4_0 32
|
||||||
|
#define N_SIMDGROUP 4
|
||||||
|
|
||||||
|
#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
|
||||||
|
float shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s0, 0); \
|
||||||
|
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s1, 0); \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s2, 0); \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s3, 0); \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s4, 0); \
|
||||||
|
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s5, 0); \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s6, 0); \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s7, 0); \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s0, 1); \
|
||||||
|
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s1, 1); \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s2, 1); \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s3, 1); \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s4, 1); \
|
||||||
|
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s5, 1); \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s6, 1); \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s7, 1); \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
|
||||||
|
|
||||||
|
#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
|
||||||
|
shared_y = sub_group_broadcast(y.s0, 2); \
|
||||||
|
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s1, 2); \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s2, 2); \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s3, 2); \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s4, 2); \
|
||||||
|
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s5, 2); \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s6, 2); \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s7, 2); \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s0, 3); \
|
||||||
|
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s1, 3); \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s2, 3); \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s3, 3); \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s4, 3); \
|
||||||
|
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s5, 3); \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s6, 3); \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y.s7, 3); \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
|
||||||
|
|
||||||
|
|
||||||
|
#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
|
||||||
|
float8 shared_y; \
|
||||||
|
shared_y = sub_group_broadcast(y, 0); \
|
||||||
|
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
|
||||||
|
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
|
||||||
|
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
|
||||||
|
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
|
||||||
|
shared_y = sub_group_broadcast(y, 1); \
|
||||||
|
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
|
||||||
|
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
|
||||||
|
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
|
||||||
|
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
|
||||||
|
|
||||||
|
|
||||||
|
#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
|
||||||
|
shared_y = sub_group_broadcast(y, 2); \
|
||||||
|
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
|
||||||
|
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
|
||||||
|
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
|
||||||
|
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
|
||||||
|
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
|
||||||
|
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
|
||||||
|
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
|
||||||
|
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
|
||||||
|
shared_y = sub_group_broadcast(y, 3); \
|
||||||
|
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
|
||||||
|
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
|
||||||
|
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
|
||||||
|
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
|
||||||
|
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
|
||||||
|
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
|
||||||
|
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
|
||||||
|
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
|
||||||
|
|
||||||
|
|
||||||
|
__attribute__((qcom_reqd_sub_group_size("full")))
|
||||||
|
__kernel void kernel_gemv_noshuffle(
|
||||||
|
__read_only image1d_buffer_t src0_q, // quantized A
|
||||||
|
global half2 * src0_d, // A scales
|
||||||
|
__read_only image1d_buffer_t src1, // B
|
||||||
|
ulong offset1, // offset to B (0)
|
||||||
|
global float * dst, // C
|
||||||
|
ulong offsetd, // offset to C (0)
|
||||||
|
int ne00, // K
|
||||||
|
int ne01, // M
|
||||||
|
int ne02, // 1
|
||||||
|
int ne10, // K
|
||||||
|
int ne12, // 1
|
||||||
|
int ne0, // M
|
||||||
|
int ne1, // N
|
||||||
|
int r2, // 1
|
||||||
|
int r3)
|
||||||
|
{
|
||||||
|
uint groupId = get_local_id(1);
|
||||||
|
uint gid = get_global_id(0);
|
||||||
|
ushort slid = get_sub_group_local_id();
|
||||||
|
|
||||||
|
uint K = ne00;
|
||||||
|
uint M = ne01;
|
||||||
|
|
||||||
|
uint LINE_STRIDE_A = M / 2;
|
||||||
|
uint BLOCK_STRIDE_A = N_SIMDGROUP * M;
|
||||||
|
|
||||||
|
__private uint4 regA;
|
||||||
|
__private half2 regS;
|
||||||
|
__private float8 regB;
|
||||||
|
|
||||||
|
__private float2 totalSum = (float2)(0.0f);
|
||||||
|
|
||||||
|
// loop along K in block granularity, skip 4 blocks every iter
|
||||||
|
for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
|
||||||
|
regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
|
||||||
|
// first 4 fibers in each wave load 8 B values to its private scope
|
||||||
|
if (slid < 4) {
|
||||||
|
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
|
||||||
|
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
|
||||||
|
}
|
||||||
|
|
||||||
|
// load half weights for two blocks in consecutive rows
|
||||||
|
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
|
||||||
|
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
|
||||||
|
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
|
||||||
|
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
|
||||||
|
#ifdef VECTOR_SUB_GROUP_BROADCAT
|
||||||
|
dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
|
||||||
|
#else
|
||||||
|
dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
|
||||||
|
#endif // VECTOR_SUB_GROUP_BROADCAT
|
||||||
|
|
||||||
|
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
|
||||||
|
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
|
||||||
|
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
|
||||||
|
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
|
||||||
|
#ifdef VECTOR_SUB_GROUP_BROADCAT
|
||||||
|
dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
|
||||||
|
#else
|
||||||
|
dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
|
||||||
|
#endif // VECTOR_SUB_GROUP_BROADCAT
|
||||||
|
}
|
||||||
|
|
||||||
|
// reduction in local memory, assumes #wave=4
|
||||||
|
__local float2 reduceLM[SIMDGROUP_WIDTH * 3];
|
||||||
|
if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
|
||||||
|
if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
|
||||||
|
if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
|
||||||
|
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
|
||||||
|
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
|
||||||
|
|
||||||
|
// 2 outputs per fiber in wave 0
|
||||||
|
if (groupId == 0) {
|
||||||
|
dst = (global float*)((global char*)dst + offsetd);
|
||||||
|
vstore2(totalSum, 0, &(dst[gid * 2]));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
1225
ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl
Normal file
1225
ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl
Normal file
File diff suppressed because it is too large
Load Diff
130
ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
Normal file
130
ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
// src0_q, src0_d, src1 are transposed as a preprocessing step
|
||||||
|
// 4-bit weights are transposed in groups of 4 (unsigned short int)
|
||||||
|
// consider weights originally "next to each other", now "on top of each other"
|
||||||
|
// each fiber computes a 8x4 tile of output elements
|
||||||
|
// using unshuffled weights
|
||||||
|
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
|
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||||
|
|
||||||
|
__attribute__((qcom_reqd_sub_group_size("full")))
|
||||||
|
kernel void kernel_mul_mat_Ab_Bi_8x4(
|
||||||
|
global const ushort * src0_q, // quantized A
|
||||||
|
global const half * src0_d, // A scales
|
||||||
|
__read_only image1d_buffer_t src1, // B (1d image)
|
||||||
|
global float * dst, // C
|
||||||
|
int m, // M
|
||||||
|
int n, // N with padding
|
||||||
|
int k, // K
|
||||||
|
int n_no_padding // N without padding
|
||||||
|
) {
|
||||||
|
|
||||||
|
int m_4 = m >> 2;
|
||||||
|
int n_4 = n >> 2;
|
||||||
|
|
||||||
|
int gy = get_global_id(0);
|
||||||
|
int gx = get_global_id(1);
|
||||||
|
int gx_2 = gx << 2;
|
||||||
|
|
||||||
|
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; // 8x4 output elements
|
||||||
|
half8 B; // registers for activations
|
||||||
|
half4 dequantized_weights; // registers for dequantized weights
|
||||||
|
__global const ushort* weight_ptr = src0_q + gx_2; // pointer for weights
|
||||||
|
__global const half* scale_ptr = src0_d + gx_2; // pointer for scales
|
||||||
|
|
||||||
|
for(int i=0; i<k; i+=4){ //loop through K dimension
|
||||||
|
|
||||||
|
B.s0123 = read_imageh(src1, gy*2 + (i)*(n_4));
|
||||||
|
B.s4567 = read_imageh(src1, gy*2 + (i)*(n_4)+1);
|
||||||
|
|
||||||
|
// keep (i/4) and (i/32) in parenthesis, rounds down
|
||||||
|
// load 4 consecutive groups of 4 weights
|
||||||
|
ushort4 bits4 = vload4(0, weight_ptr + (i/4)*(m)); // (i/4) because weights grouped in 4s
|
||||||
|
|
||||||
|
// load 4 consecutive scales
|
||||||
|
half4 scale = vload4(0, scale_ptr + (i/32)*(m));// (i/32) because 1 scale per 32 elements
|
||||||
|
|
||||||
|
// j=0
|
||||||
|
dequantized_weights.s0 = ((bits4.s0 & (0x000F)) - 8) * scale.s0; // dequantize a row of the 16 weights
|
||||||
|
dequantized_weights.s1 = ((bits4.s1 & (0x000F)) - 8) * scale.s1;
|
||||||
|
dequantized_weights.s2 = ((bits4.s2 & (0x000F)) - 8) * scale.s2;
|
||||||
|
dequantized_weights.s3 = ((bits4.s3 & (0x000F)) - 8) * scale.s3;
|
||||||
|
c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
|
||||||
|
c1 += B * dequantized_weights.s1;
|
||||||
|
c2 += B * dequantized_weights.s2;
|
||||||
|
c3 += B * dequantized_weights.s3;
|
||||||
|
|
||||||
|
// j=1
|
||||||
|
B.s0123 = read_imageh(src1, gy*2 + (i+1)*(n_4));
|
||||||
|
B.s4567 = read_imageh(src1, gy*2 + (i+1)*(n_4)+1);
|
||||||
|
dequantized_weights.s0 = (((bits4.s0 & (0x00F0)) >> 4) - 8) * scale.s0; // dequantize a row of the 16 weights
|
||||||
|
dequantized_weights.s1 = (((bits4.s1 & (0x00F0)) >> 4) - 8) * scale.s1;
|
||||||
|
dequantized_weights.s2 = (((bits4.s2 & (0x00F0)) >> 4) - 8) * scale.s2;
|
||||||
|
dequantized_weights.s3 = (((bits4.s3 & (0x00F0)) >> 4) - 8) * scale.s3;
|
||||||
|
c0 += B * dequantized_weights.s0; //vector-scalar multiplication to accumulate
|
||||||
|
c1 += B * dequantized_weights.s1;
|
||||||
|
c2 += B * dequantized_weights.s2;
|
||||||
|
c3 += B * dequantized_weights.s3;
|
||||||
|
|
||||||
|
// j=2
|
||||||
|
B.s0123 = read_imageh(src1, gy*2 + (i+2)*(n_4));
|
||||||
|
B.s4567 = read_imageh(src1, gy*2 + (i+2)*(n_4)+1);
|
||||||
|
dequantized_weights.s0 = (((bits4.s0 & (0x0F00)) >> 8) - 8) * scale.s0; // dequantize a row of the 16 weights
|
||||||
|
dequantized_weights.s1 = (((bits4.s1 & (0x0F00)) >> 8) - 8) * scale.s1;
|
||||||
|
dequantized_weights.s2 = (((bits4.s2 & (0x0F00)) >> 8) - 8) * scale.s2;
|
||||||
|
dequantized_weights.s3 = (((bits4.s3 & (0x0F00)) >> 8) - 8) * scale.s3;
|
||||||
|
c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
|
||||||
|
c1 += B * dequantized_weights.s1;
|
||||||
|
c2 += B * dequantized_weights.s2;
|
||||||
|
c3 += B * dequantized_weights.s3;
|
||||||
|
|
||||||
|
// j=3
|
||||||
|
B.s0123 = read_imageh(src1, gy*2 + (i+3)*(n_4));
|
||||||
|
B.s4567 = read_imageh(src1, gy*2 + (i+3)*(n_4)+1);
|
||||||
|
dequantized_weights.s0 = (((bits4.s0 & (0xF000)) >> 12) - 8) * scale.s0; // dequantize a row of the 16 weights
|
||||||
|
dequantized_weights.s1 = (((bits4.s1 & (0xF000)) >> 12) - 8) * scale.s1;
|
||||||
|
dequantized_weights.s2 = (((bits4.s2 & (0xF000)) >> 12) - 8) * scale.s2;
|
||||||
|
dequantized_weights.s3 = (((bits4.s3 & (0xF000)) >> 12) - 8) * scale.s3;
|
||||||
|
c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
|
||||||
|
c1 += B * dequantized_weights.s1;
|
||||||
|
c2 += B * dequantized_weights.s2;
|
||||||
|
c3 += B * dequantized_weights.s3;
|
||||||
|
}
|
||||||
|
|
||||||
|
int idx = (gy<<3)*m + (gx<<2); // vectorized store 16 elements
|
||||||
|
|
||||||
|
// conditional check if store is to a valid location. Required when N is not a multiple of 8
|
||||||
|
// if statements allow registers to be reused for each store
|
||||||
|
// provides a performance boost due to reduced register footprint, which increases number of concurrent waves
|
||||||
|
if(idx+3 < m*n_no_padding){
|
||||||
|
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
|
||||||
|
idx += m;
|
||||||
|
}
|
||||||
|
if(idx+3 < m*n_no_padding){
|
||||||
|
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
|
||||||
|
idx += m;
|
||||||
|
}
|
||||||
|
if(idx+3 < m*n_no_padding){
|
||||||
|
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
|
||||||
|
idx += m;
|
||||||
|
}
|
||||||
|
if(idx+3 < m*n_no_padding){
|
||||||
|
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
|
||||||
|
idx += m;
|
||||||
|
}
|
||||||
|
if(idx+3 < m*n_no_padding){
|
||||||
|
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
|
||||||
|
idx += m;
|
||||||
|
}
|
||||||
|
if(idx+3 < m*n_no_padding){
|
||||||
|
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
|
||||||
|
idx += m;
|
||||||
|
}
|
||||||
|
if(idx+3 < m*n_no_padding){
|
||||||
|
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
|
||||||
|
idx += m;
|
||||||
|
}
|
||||||
|
if(idx+3 < m*n_no_padding){
|
||||||
|
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
|
||||||
|
}
|
||||||
|
}
|
32
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
Normal file
32
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
// 16-bit transpose, loading/storing an 8x8 tile of elements
|
||||||
|
|
||||||
|
kernel void kernel_transpose_16(
|
||||||
|
__read_only image1d_buffer_t input,
|
||||||
|
__write_only image1d_buffer_t output,
|
||||||
|
const uint rows,
|
||||||
|
const uint cols
|
||||||
|
) {
|
||||||
|
|
||||||
|
const int i = get_global_id(0);
|
||||||
|
const int j = get_global_id(1);
|
||||||
|
const int i_3 = i<<3;
|
||||||
|
const int j_3 = j<<3;
|
||||||
|
|
||||||
|
ushort8 temp0 = as_ushort8(read_imagef(input, (j_3+0)*cols+i));
|
||||||
|
ushort8 temp1 = as_ushort8(read_imagef(input, (j_3+1)*cols+i));
|
||||||
|
ushort8 temp2 = as_ushort8(read_imagef(input, (j_3+2)*cols+i));
|
||||||
|
ushort8 temp3 = as_ushort8(read_imagef(input, (j_3+3)*cols+i));
|
||||||
|
ushort8 temp4 = as_ushort8(read_imagef(input, (j_3+4)*cols+i));
|
||||||
|
ushort8 temp5 = as_ushort8(read_imagef(input, (j_3+5)*cols+i));
|
||||||
|
ushort8 temp6 = as_ushort8(read_imagef(input, (j_3+6)*cols+i));
|
||||||
|
ushort8 temp7 = as_ushort8(read_imagef(input, (j_3+7)*cols+i));
|
||||||
|
|
||||||
|
write_imagef(output, (i_3+0)*rows+j, as_float4((ushort8)(temp0.s0, temp1.s0, temp2.s0, temp3.s0, temp4.s0, temp5.s0, temp6.s0, temp7.s0)));
|
||||||
|
write_imagef(output, (i_3+1)*rows+j, as_float4((ushort8)(temp0.s1, temp1.s1, temp2.s1, temp3.s1, temp4.s1, temp5.s1, temp6.s1, temp7.s1)));
|
||||||
|
write_imagef(output, (i_3+2)*rows+j, as_float4((ushort8)(temp0.s2, temp1.s2, temp2.s2, temp3.s2, temp4.s2, temp5.s2, temp6.s2, temp7.s2)));
|
||||||
|
write_imagef(output, (i_3+3)*rows+j, as_float4((ushort8)(temp0.s3, temp1.s3, temp2.s3, temp3.s3, temp4.s3, temp5.s3, temp6.s3, temp7.s3)));
|
||||||
|
write_imagef(output, (i_3+4)*rows+j, as_float4((ushort8)(temp0.s4, temp1.s4, temp2.s4, temp3.s4, temp4.s4, temp5.s4, temp6.s4, temp7.s4)));
|
||||||
|
write_imagef(output, (i_3+5)*rows+j, as_float4((ushort8)(temp0.s5, temp1.s5, temp2.s5, temp3.s5, temp4.s5, temp5.s5, temp6.s5, temp7.s5)));
|
||||||
|
write_imagef(output, (i_3+6)*rows+j, as_float4((ushort8)(temp0.s6, temp1.s6, temp2.s6, temp3.s6, temp4.s6, temp5.s6, temp6.s6, temp7.s6)));
|
||||||
|
write_imagef(output, (i_3+7)*rows+j, as_float4((ushort8)(temp0.s7, temp1.s7, temp2.s7, temp3.s7, temp4.s7, temp5.s7, temp6.s7, temp7.s7)));
|
||||||
|
}
|
25
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl
Normal file
25
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
// 32-bit transpose, loading/storing a 4x4 tile of elements
|
||||||
|
|
||||||
|
kernel void kernel_transpose_32(
|
||||||
|
__read_only image1d_buffer_t input,
|
||||||
|
__write_only image1d_buffer_t output,
|
||||||
|
const uint rows,
|
||||||
|
const uint cols
|
||||||
|
) {
|
||||||
|
|
||||||
|
const int i = get_global_id(0);
|
||||||
|
const int j = get_global_id(1);
|
||||||
|
const int i_2 = i<<2;
|
||||||
|
const int j_2 = j<<2;
|
||||||
|
|
||||||
|
float4 temp0 = read_imagef(input, (j_2+0)*cols+i);
|
||||||
|
float4 temp1 = read_imagef(input, (j_2+1)*cols+i);
|
||||||
|
float4 temp2 = read_imagef(input, (j_2+2)*cols+i);
|
||||||
|
float4 temp3 = read_imagef(input, (j_2+3)*cols+i);
|
||||||
|
|
||||||
|
write_imagef(output, (i_2+0)*rows+j, (float4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
|
||||||
|
write_imagef(output, (i_2+1)*rows+j, (float4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
|
||||||
|
write_imagef(output, (i_2+2)*rows+j, (float4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
|
||||||
|
write_imagef(output, (i_2+3)*rows+j, (float4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
|
||||||
|
|
||||||
|
}
|
35
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl
Normal file
35
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
// 32-bit transpose, loading/storing a 4x4 tile of elements
|
||||||
|
// Only used for activations
|
||||||
|
// converts to FP16
|
||||||
|
// also adds zero padding for non multiple of 8 prompt lengths
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
|
|
||||||
|
kernel void kernel_transpose_32_16(__read_only image1d_buffer_t input, __write_only image1d_buffer_t output, const uint rows, const uint cols, const uint padded_rows) {
|
||||||
|
|
||||||
|
const int i = get_global_id(0);
|
||||||
|
const int j = get_global_id(1);
|
||||||
|
const int i_2 = i<<2;
|
||||||
|
const int j_2 = j<<2;
|
||||||
|
half4 temp0 = {0,0,0,0}; // initialize outputs to 0
|
||||||
|
half4 temp1 = {0,0,0,0};
|
||||||
|
half4 temp2 = {0,0,0,0};
|
||||||
|
half4 temp3 = {0,0,0,0};
|
||||||
|
|
||||||
|
if((j_2+0)*cols+i*4+3 < rows*cols*16){ // only load from a valid location. Otherwise keep register data as 0
|
||||||
|
temp0 = read_imageh(input, (j_2+0)*cols+i);
|
||||||
|
}
|
||||||
|
if((j_2+1)*cols+i*4+3 < rows*cols*16){
|
||||||
|
temp1 = read_imageh(input, (j_2+1)*cols+i);
|
||||||
|
}
|
||||||
|
if((j_2+2)*cols+i*4+3 < rows*cols*16){
|
||||||
|
temp2 = read_imageh(input, (j_2+2)*cols+i);
|
||||||
|
}
|
||||||
|
if((j_2+3)*cols+i*4+3 < rows*cols*16){
|
||||||
|
temp3 = read_imageh(input, (j_2+3)*cols+i);
|
||||||
|
}
|
||||||
|
|
||||||
|
write_imageh(output, (i_2+0)*padded_rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); // no conditionals for output, includes zero padding
|
||||||
|
write_imageh(output, (i_2+1)*padded_rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
|
||||||
|
write_imageh(output, (i_2+2)*padded_rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
|
||||||
|
write_imageh(output, (i_2+3)*padded_rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user