mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-28 15:18:26 +01:00
Merge branch 'master' into gg/flash-attn
This commit is contained in:
commit
3d03bcb7af
1
.ecrc
1
.ecrc
@ -1,4 +1,5 @@
|
||||
{
|
||||
"Exclude": ["^\\.gitmodules$"],
|
||||
"Disable": {
|
||||
"IndentSize": true
|
||||
}
|
||||
|
21
.github/workflows/build.yml
vendored
21
.github/workflows/build.yml
vendored
@ -337,6 +337,7 @@ jobs:
|
||||
OPENCL_VERSION: 2023.04.17
|
||||
CLBLAST_VERSION: 1.6.0
|
||||
SDE_VERSION: 9.33.0-2024-01-07
|
||||
VULKAN_VERSION: 1.3.261.1
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
@ -353,6 +354,8 @@ jobs:
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
- build: 'openblas'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'kompute'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@ -361,6 +364,12 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Clone Kompute submodule
|
||||
id: clone_kompute
|
||||
if: ${{ matrix.build == 'kompute' }}
|
||||
run: |
|
||||
git submodule update --init kompute
|
||||
|
||||
- name: Download OpenCL SDK
|
||||
id: get_opencl
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
@ -395,6 +404,15 @@ jobs:
|
||||
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
|
||||
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'kompute' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
|
||||
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
@ -432,7 +450,8 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
|
||||
# not all machines have native AVX-512
|
||||
if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -C Release --verbose --timeout 900
|
||||
|
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
[submodule "kompute"]
|
||||
path = kompute
|
||||
url = https://github.com/nomic-ai/kompute.git
|
166
CMakeLists.txt
166
CMakeLists.txt
@ -103,6 +103,7 @@ option(LLAMA_VULKAN "llama: use Vulkan"
|
||||
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
||||
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
|
||||
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
|
||||
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
option(LLAMA_SYCL "llama: use SYCL" OFF)
|
||||
@ -422,7 +423,13 @@ if (LLAMA_VULKAN)
|
||||
if (Vulkan_FOUND)
|
||||
message(STATUS "Vulkan found")
|
||||
|
||||
set(GGML_HEADERS_VULKAN ggml-vulkan.h)
|
||||
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
|
||||
|
||||
add_library(ggml-vulkan STATIC ggml-vulkan.cpp ggml-vulkan.h)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(ggml-vulkan PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
|
||||
|
||||
add_compile_definitions(GGML_USE_VULKAN)
|
||||
@ -478,7 +485,6 @@ if (LLAMA_HIPBLAS)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
if (LLAMA_SYCL)
|
||||
if ( NOT DEFINED ENV{ONEAPI_ROOT})
|
||||
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
|
||||
@ -504,6 +510,160 @@ if (LLAMA_SYCL)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
endif()
|
||||
|
||||
if (LLAMA_KOMPUTE)
|
||||
add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
|
||||
find_package(Vulkan COMPONENTS glslc REQUIRED)
|
||||
find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
|
||||
if (NOT glslc_executable)
|
||||
message(FATAL_ERROR "glslc not found")
|
||||
endif()
|
||||
|
||||
function(compile_shader)
|
||||
set(options)
|
||||
set(oneValueArgs)
|
||||
set(multiValueArgs SOURCES)
|
||||
cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
|
||||
foreach(source ${compile_shader_SOURCES})
|
||||
get_filename_component(filename ${source} NAME)
|
||||
set(spv_file ${filename}.spv)
|
||||
add_custom_command(
|
||||
OUTPUT ${spv_file}
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
|
||||
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
|
||||
COMMENT "Compiling ${source} to ${spv_file}"
|
||||
)
|
||||
|
||||
get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
|
||||
set(FILE_NAME "shader${RAW_FILE_NAME}")
|
||||
string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
|
||||
string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
|
||||
string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
|
||||
set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
|
||||
message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
|
||||
if(CMAKE_GENERATOR MATCHES "Visual Studio")
|
||||
add_custom_command(
|
||||
OUTPUT ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
DEPENDS ${spv_file} xxd
|
||||
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
|
||||
)
|
||||
else()
|
||||
add_custom_command(
|
||||
OUTPUT ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
DEPENDS ${spv_file} xxd
|
||||
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
|
||||
)
|
||||
endif()
|
||||
endforeach()
|
||||
endfunction()
|
||||
|
||||
if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
|
||||
message(STATUS "Kompute found")
|
||||
set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
|
||||
add_subdirectory(kompute)
|
||||
|
||||
# Compile our shaders
|
||||
compile_shader(SOURCES
|
||||
kompute-shaders/op_scale.comp
|
||||
kompute-shaders/op_scale_8.comp
|
||||
kompute-shaders/op_add.comp
|
||||
kompute-shaders/op_addrow.comp
|
||||
kompute-shaders/op_mul.comp
|
||||
kompute-shaders/op_silu.comp
|
||||
kompute-shaders/op_relu.comp
|
||||
kompute-shaders/op_gelu.comp
|
||||
kompute-shaders/op_softmax.comp
|
||||
kompute-shaders/op_norm.comp
|
||||
kompute-shaders/op_rmsnorm.comp
|
||||
kompute-shaders/op_diagmask.comp
|
||||
kompute-shaders/op_mul_mat_mat_f32.comp
|
||||
kompute-shaders/op_mul_mat_f16.comp
|
||||
kompute-shaders/op_mul_mat_q8_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_1.comp
|
||||
kompute-shaders/op_mul_mat_q6_k.comp
|
||||
kompute-shaders/op_getrows_f16.comp
|
||||
kompute-shaders/op_getrows_q4_0.comp
|
||||
kompute-shaders/op_getrows_q4_1.comp
|
||||
kompute-shaders/op_getrows_q6_k.comp
|
||||
kompute-shaders/op_rope_f16.comp
|
||||
kompute-shaders/op_rope_f32.comp
|
||||
kompute-shaders/op_cpy_f16_f16.comp
|
||||
kompute-shaders/op_cpy_f16_f32.comp
|
||||
kompute-shaders/op_cpy_f32_f16.comp
|
||||
kompute-shaders/op_cpy_f32_f32.comp
|
||||
)
|
||||
|
||||
# Create a custom target for our generated shaders
|
||||
add_custom_target(generated_shaders DEPENDS
|
||||
shaderop_scale.h
|
||||
shaderop_scale_8.h
|
||||
shaderop_add.h
|
||||
shaderop_addrow.h
|
||||
shaderop_mul.h
|
||||
shaderop_silu.h
|
||||
shaderop_relu.h
|
||||
shaderop_gelu.h
|
||||
shaderop_softmax.h
|
||||
shaderop_norm.h
|
||||
shaderop_rmsnorm.h
|
||||
shaderop_diagmask.h
|
||||
shaderop_mul_mat_mat_f32.h
|
||||
shaderop_mul_mat_f16.h
|
||||
shaderop_mul_mat_q8_0.h
|
||||
shaderop_mul_mat_q4_0.h
|
||||
shaderop_mul_mat_q4_1.h
|
||||
shaderop_mul_mat_q6_k.h
|
||||
shaderop_getrows_f16.h
|
||||
shaderop_getrows_q4_0.h
|
||||
shaderop_getrows_q4_1.h
|
||||
shaderop_getrows_q6_k.h
|
||||
shaderop_rope_f16.h
|
||||
shaderop_rope_f32.h
|
||||
shaderop_cpy_f16_f16.h
|
||||
shaderop_cpy_f16_f32.h
|
||||
shaderop_cpy_f32_f16.h
|
||||
shaderop_cpy_f32_f32.h
|
||||
)
|
||||
|
||||
# Create a custom command that depends on the generated_shaders
|
||||
add_custom_command(
|
||||
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
|
||||
COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
|
||||
DEPENDS generated_shaders
|
||||
COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
|
||||
)
|
||||
|
||||
# Add the stamp to the main sources to ensure dependency tracking
|
||||
set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
|
||||
set(GGML_HEADERS_KOMPUTE ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
|
||||
add_compile_definitions(GGML_USE_KOMPUTE)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
|
||||
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
|
||||
else()
|
||||
message(WARNING "Kompute not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
function(get_flags CCID CCVER)
|
||||
set(C_FLAGS "")
|
||||
set(CXX_FLAGS "")
|
||||
@ -848,10 +1008,12 @@ add_library(ggml OBJECT
|
||||
ggml-quants.h
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
|
||||
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
||||
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
|
||||
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
|
||||
)
|
||||
|
||||
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
|
||||
@ -928,7 +1090,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
|
||||
|
||||
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}" "${GGML_HEADERS_VULKAN}"
|
||||
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
|
||||
|
||||
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
|
@ -10,7 +10,6 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
### Hot topics
|
||||
|
||||
- ⚠️ Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
|
||||
- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
|
||||
- Collecting Apple Silicon performance stats:
|
||||
- M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
|
||||
|
@ -334,7 +334,10 @@ class Params:
|
||||
class BpeVocab:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
||||
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
|
||||
try:
|
||||
self.vocab = self.bpe_tokenizer["model"]["vocab"]
|
||||
except KeyError:
|
||||
self.vocab = self.bpe_tokenizer
|
||||
added_tokens: dict[str, int]
|
||||
if fname_added_tokens is not None:
|
||||
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
||||
|
@ -39,6 +39,17 @@ static std::ostringstream * g_output_ss;
|
||||
static std::vector<llama_token> * g_output_tokens;
|
||||
static bool is_interacting = false;
|
||||
|
||||
static bool file_exists(const std::string &path) {
|
||||
std::ifstream f(path.c_str());
|
||||
return f.good();
|
||||
}
|
||||
|
||||
static bool file_is_empty(const std::string &path) {
|
||||
std::ifstream f;
|
||||
f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
|
||||
f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
|
||||
return f.tellg() == 0;
|
||||
}
|
||||
|
||||
static void write_logfile(
|
||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||
@ -215,12 +226,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (!path_session.empty()) {
|
||||
LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
||||
|
||||
// fopen to check for existing session
|
||||
FILE * fp = std::fopen(path_session.c_str(), "rb");
|
||||
if (fp != NULL) {
|
||||
std::fclose(fp);
|
||||
|
||||
if (!file_exists(path_session)) {
|
||||
LOG_TEE("%s: session file does not exist, will create.\n", __func__);
|
||||
} else if (file_is_empty(path_session)) {
|
||||
LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
|
||||
} else {
|
||||
// The file exists and is not empty
|
||||
session_tokens.resize(n_ctx);
|
||||
size_t n_token_count_out = 0;
|
||||
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
||||
@ -229,10 +240,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
session_tokens.resize(n_token_count_out);
|
||||
llama_set_rng_seed(ctx, params.seed);
|
||||
|
||||
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
|
||||
} else {
|
||||
LOG_TEE("%s: session file does not exist, will create\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -378,6 +378,8 @@ int main(int argc, char ** argv) {
|
||||
printf("testing %s ...\n", ggml_type_name(type));
|
||||
}
|
||||
|
||||
ggml_quantize_init(type);
|
||||
|
||||
error_stats global_stats {};
|
||||
|
||||
for (const auto& kv_tensor : tensors) {
|
||||
|
@ -25,6 +25,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
|
||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
|
||||
{ "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
|
||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||
{ "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
|
||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
|
||||
@ -36,7 +37,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
|
||||
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
|
@ -32,6 +32,7 @@ Command line options:
|
||||
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
|
||||
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
|
||||
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
|
||||
|
||||
## Build
|
||||
|
||||
server is build alongside everything else from the root of the project
|
||||
@ -52,21 +53,23 @@ server is build alongside everything else from the root of the project
|
||||
|
||||
To get started right away, run the following command, making sure to use the correct path for the model you have:
|
||||
|
||||
### Unix-based systems (Linux, macOS, etc.):
|
||||
### Unix-based systems (Linux, macOS, etc.)
|
||||
|
||||
```bash
|
||||
./server -m models/7B/ggml-model.gguf -c 2048
|
||||
```
|
||||
|
||||
### Windows:
|
||||
### Windows
|
||||
|
||||
```powershell
|
||||
server.exe -m models\7B\ggml-model.gguf -c 2048
|
||||
```
|
||||
|
||||
The above command will start a server that by default listens on `127.0.0.1:8080`.
|
||||
You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
|
||||
|
||||
### Docker:
|
||||
### Docker
|
||||
|
||||
```bash
|
||||
docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
|
||||
|
||||
@ -120,6 +123,7 @@ node index.js
|
||||
```
|
||||
|
||||
## API Endpoints
|
||||
|
||||
- **GET** `/health`: Returns the current state of the server:
|
||||
- `{"status": "loading model"}` if the model is still being loaded.
|
||||
- `{"status": "error"}` if the model failed to load.
|
||||
@ -189,14 +193,13 @@ node index.js
|
||||
|
||||
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
|
||||
|
||||
### Result JSON:
|
||||
|
||||
* Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
|
||||
### Result JSON
|
||||
|
||||
- Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
|
||||
|
||||
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"content": "<the token selected by the model>",
|
||||
"probs": [
|
||||
@ -212,6 +215,7 @@ node index.js
|
||||
]
|
||||
},
|
||||
```
|
||||
|
||||
Notice that each `probs` is an array of length `n_probs`.
|
||||
|
||||
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
|
||||
@ -290,6 +294,7 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
|
||||
print(completion.choices[0].message)
|
||||
```
|
||||
|
||||
... or raw HTTP requests:
|
||||
|
||||
```shell
|
||||
@ -311,6 +316,40 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
}'
|
||||
```
|
||||
|
||||
- **POST** `/v1/embeddings`: OpenAI-compatible embeddings API.
|
||||
|
||||
*Options:*
|
||||
|
||||
See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
|
||||
|
||||
*Examples:*
|
||||
|
||||
- input as string
|
||||
|
||||
```shell
|
||||
curl http://localhost:8080/v1/embeddings \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer no-key" \
|
||||
-d '{
|
||||
"input": "hello",
|
||||
"model":"GPT-4",
|
||||
"encoding_format": "float"
|
||||
}'
|
||||
```
|
||||
|
||||
- `input` as string array
|
||||
|
||||
```shell
|
||||
curl http://localhost:8080/v1/embeddings \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer no-key" \
|
||||
-d '{
|
||||
"input": ["hello", "world"],
|
||||
"model":"GPT-4",
|
||||
"encoding_format": "float"
|
||||
}'
|
||||
```
|
||||
|
||||
## More examples
|
||||
|
||||
### Change system prompt on runtime
|
||||
@ -362,6 +401,7 @@ python api_like_OAI.py
|
||||
```
|
||||
|
||||
After running the API server, you can use it in Python by setting the API base URL.
|
||||
|
||||
```python
|
||||
openai.api_base = "http://<Your api-server IP>:port"
|
||||
```
|
||||
|
@ -48,6 +48,7 @@ chat_completion() {
|
||||
top_p: 0.9,
|
||||
n_keep: $n_keep,
|
||||
n_predict: 256,
|
||||
cache_prompt: true,
|
||||
stop: ["\n### Human:"],
|
||||
stream: true
|
||||
}')"
|
||||
|
@ -206,3 +206,18 @@ inline static std::vector<json> format_partial_response_oaicompat(const task_res
|
||||
|
||||
return std::vector<json>({ret});
|
||||
}
|
||||
|
||||
inline static json format_embeddings_response_oaicompat(const json &request, const json &embeddings)
|
||||
{
|
||||
json res =
|
||||
json{
|
||||
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
|
||||
{"object", "list"},
|
||||
{"usage",
|
||||
json{{"prompt_tokens", 0},
|
||||
{"total_tokens", 0}}},
|
||||
{"data", embeddings}
|
||||
};
|
||||
return res;
|
||||
}
|
||||
|
||||
|
@ -220,6 +220,7 @@ struct llama_client_slot
|
||||
infill = false;
|
||||
ga_i = 0;
|
||||
n_past_se = 0;
|
||||
|
||||
generated_token_probs.clear();
|
||||
|
||||
for (slot_image & img : images)
|
||||
@ -1227,7 +1228,7 @@ struct llama_server_context
|
||||
std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
|
||||
for (int i = 0; i < (int) append_tokens.size(); ++i)
|
||||
{
|
||||
llama_batch_add(batch, append_tokens[i], slot.n_past, { slot.id }, true);
|
||||
llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
|
||||
slot.n_past += 1;
|
||||
}
|
||||
}
|
||||
@ -1295,6 +1296,8 @@ struct llama_server_context
|
||||
for (llama_client_slot &slot : slots)
|
||||
{
|
||||
slot.cache_tokens.clear();
|
||||
slot.n_past = 0;
|
||||
slot.n_past_se = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1364,26 +1367,26 @@ struct llama_server_context
|
||||
kv_cache_clear();
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
}
|
||||
|
||||
task_server task;
|
||||
task.type = TASK_TYPE_NEXT_RESPONSE;
|
||||
task.target_id = -1;
|
||||
queue_tasks.post(task);
|
||||
}
|
||||
|
||||
for (llama_client_slot &slot : slots)
|
||||
{
|
||||
if (slot.ga_n == 1)
|
||||
{
|
||||
if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx)
|
||||
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
|
||||
{
|
||||
// Shift context
|
||||
const int n_left = slot.n_past - slot.params.n_keep - 1;
|
||||
const int n_left = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
|
||||
const int n_discard = n_left / 2;
|
||||
|
||||
LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
||||
|
||||
for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
|
||||
{
|
||||
@ -1429,8 +1432,10 @@ struct llama_server_context
|
||||
slot.i_batch = batch.n_tokens;
|
||||
|
||||
const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
|
||||
llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
|
||||
|
||||
// TODO: we always have to take into account the "system_tokens"
|
||||
// this is not great and needs to be improved somehow
|
||||
llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
|
||||
slot.n_past += 1;
|
||||
}
|
||||
|
||||
@ -1591,10 +1596,13 @@ struct llama_server_context
|
||||
|
||||
// process the prefix of first image
|
||||
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
|
||||
|
||||
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
|
||||
int ga_i = slot.ga_i;
|
||||
|
||||
int32_t ga_i = slot.ga_i;
|
||||
int32_t ga_n = slot.ga_n;
|
||||
int32_t ga_w = slot.ga_w;
|
||||
|
||||
for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
|
||||
{
|
||||
if (slot.ga_n != 1)
|
||||
@ -1606,7 +1614,7 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
|
||||
slot_npast += 1;
|
||||
slot_npast++;
|
||||
}
|
||||
|
||||
if (has_images && !ingest_images(slot, n_batch))
|
||||
@ -1666,6 +1674,7 @@ struct llama_server_context
|
||||
slot.n_past_se += n_tokens;
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch batch_view =
|
||||
{
|
||||
n_tokens,
|
||||
@ -1818,15 +1827,15 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
|
||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||
printf(" -spf FNAME, --system-prompt-file FNAME\n");
|
||||
printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
|
||||
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
||||
printf(" --log-disable disables logging to a file.\n");
|
||||
printf("\n");
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||
printf(" -gan N, --grp-attn-n N Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
||||
printf(" -gaw N, --grp-attn-w N Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
||||
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
||||
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
@ -2929,6 +2938,66 @@ int main(int argc, char **argv)
|
||||
return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
|
||||
});
|
||||
|
||||
svr.Post("/v1/embeddings", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||
{
|
||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||
const json body = json::parse(req.body);
|
||||
|
||||
json prompt;
|
||||
if (body.count("input") != 0)
|
||||
{
|
||||
prompt = body["input"];
|
||||
// batch
|
||||
if(prompt.is_array()) {
|
||||
json data = json::array();
|
||||
int i = 0;
|
||||
for (const json &elem : prompt) {
|
||||
const int task_id = llama.queue_tasks.get_new_id();
|
||||
llama.queue_results.add_waiting_task_id(task_id);
|
||||
llama.request_completion(task_id, { {"prompt", elem}, { "n_predict", 0} }, false, true, -1);
|
||||
|
||||
// get the result
|
||||
task_result result = llama.queue_results.recv(task_id);
|
||||
llama.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
json embedding = json{
|
||||
{"embedding", json_value(result.result_json, "embedding", json::array())},
|
||||
{"index", i++},
|
||||
{"object", "embedding"}
|
||||
};
|
||||
data.push_back(embedding);
|
||||
}
|
||||
json result = format_embeddings_response_oaicompat(body, data);
|
||||
return res.set_content(result.dump(), "application/json; charset=utf-8");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
prompt = "";
|
||||
}
|
||||
|
||||
// create and queue the task
|
||||
const int task_id = llama.queue_tasks.get_new_id();
|
||||
llama.queue_results.add_waiting_task_id(task_id);
|
||||
llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}}, false, true, -1);
|
||||
|
||||
// get the result
|
||||
task_result result = llama.queue_results.recv(task_id);
|
||||
llama.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
json data = json::array({json{
|
||||
{"embedding", json_value(result.result_json, "embedding", json::array())},
|
||||
{"index", 0},
|
||||
{"object", "embedding"}
|
||||
}}
|
||||
);
|
||||
|
||||
json root = format_embeddings_response_oaicompat(body, data);
|
||||
|
||||
// send the result
|
||||
return res.set_content(root.dump(), "application/json; charset=utf-8");
|
||||
});
|
||||
|
||||
// GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
|
||||
// "Bus error: 10" - this is on macOS, it does not crash on Linux
|
||||
//std::thread t2([&]()
|
||||
|
@ -791,7 +791,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
||||
for (size_t i = 0; i < *n_buffers; i++) {
|
||||
ggml_backend_buffer_free(*buffers[i]);
|
||||
}
|
||||
free(buffers);
|
||||
free(*buffers);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -373,6 +373,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
|
||||
extern GGML_CALL int ggml_backend_vk_reg_devices(void);
|
||||
ggml_backend_vk_reg_devices();
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
|
||||
ggml_backend_kompute_reg_devices();
|
||||
#endif
|
||||
}
|
||||
|
||||
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
||||
|
331
ggml-cuda.cu
331
ggml-cuda.cu
@ -191,6 +191,10 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
||||
#endif // __has_builtin(__builtin_elementwise_sub_sat)
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int __vsub4(const int a, const int b) {
|
||||
return __vsubss4(a, b);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
||||
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
||||
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||
@ -505,6 +509,14 @@ typedef struct {
|
||||
} block_iq2_xs;
|
||||
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
||||
|
||||
#define QR3_XXS 8
|
||||
#define QI3_XXS (QK_K / (4*QR3_XXS))
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[3*(QK_K/8)];
|
||||
} block_iq3_xxs;
|
||||
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
||||
|
||||
#define WARP_SIZE 32
|
||||
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
||||
|
||||
@ -1613,6 +1625,41 @@ static const __device__ uint64_t iq2xs_grid[512] = {
|
||||
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
||||
};
|
||||
|
||||
static const __device__ uint32_t iq3xxs_grid[256] = {
|
||||
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
||||
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
||||
0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
|
||||
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
|
||||
0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
|
||||
0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
|
||||
0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
|
||||
0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
|
||||
0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
|
||||
0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
|
||||
0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
|
||||
0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
|
||||
0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
|
||||
0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
|
||||
0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
|
||||
0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
|
||||
0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
|
||||
0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
|
||||
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
|
||||
0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
|
||||
0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
|
||||
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
|
||||
0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
|
||||
0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
|
||||
0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
|
||||
0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
|
||||
0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
|
||||
0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
|
||||
0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
|
||||
0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
|
||||
0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
|
||||
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
||||
};
|
||||
|
||||
static const __device__ uint8_t ksigns_iq2xs[128] = {
|
||||
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
|
||||
144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
|
||||
@ -1624,6 +1671,43 @@ static const __device__ uint8_t ksigns_iq2xs[128] = {
|
||||
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
|
||||
};
|
||||
|
||||
//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
static const __device__ uint64_t ksigns64[128] = {
|
||||
0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
|
||||
0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
|
||||
0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
|
||||
0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
|
||||
0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
|
||||
0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
|
||||
0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
|
||||
0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
|
||||
0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
|
||||
0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
|
||||
0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
|
||||
0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
|
||||
0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
|
||||
0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
|
||||
0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
|
||||
0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
|
||||
0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
|
||||
0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
|
||||
0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
|
||||
0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
|
||||
0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
|
||||
0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
|
||||
0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
|
||||
0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
|
||||
0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
|
||||
0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
|
||||
0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
|
||||
0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
|
||||
0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
|
||||
0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
|
||||
0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
|
||||
0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
|
||||
};
|
||||
//#endif
|
||||
|
||||
static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
|
||||
|
||||
inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
|
||||
@ -1690,6 +1774,34 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint8_t * q3 = x[i].qs + 8*ib;
|
||||
const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
|
||||
const uint32_t aux32 = gas[0] | (gas[1] << 16);
|
||||
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
#else
|
||||
assert(false);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||
|
||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||
@ -4313,6 +4425,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
#if QK_K == 256
|
||||
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
||||
|
||||
@ -4323,20 +4436,22 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
||||
const uint8_t ls2 = bq2->scales[ib32] >> 4;
|
||||
int sumi1 = 0;
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
||||
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
|
||||
const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
|
||||
const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
|
||||
const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
|
||||
sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
|
||||
sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
|
||||
q8 += 8;
|
||||
}
|
||||
int sumi2 = 0;
|
||||
for (int l = 2; l < 4; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
||||
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
|
||||
const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
|
||||
const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
|
||||
const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
|
||||
sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
|
||||
sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
|
||||
q8 += 8;
|
||||
}
|
||||
const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
|
||||
@ -4345,6 +4460,45 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
||||
assert(false);
|
||||
return 0.f;
|
||||
#endif
|
||||
#else
|
||||
assert(false);
|
||||
return 0.f;
|
||||
#endif
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
#if QK_K == 256
|
||||
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
|
||||
|
||||
const int ib32 = iqs;
|
||||
const uint8_t * q3 = bq2->qs + 8*ib32;
|
||||
const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
|
||||
const int8_t * q8 = bq8_1[ib32].qs;
|
||||
uint32_t aux32 = gas[0] | (gas[1] << 16);
|
||||
int sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
|
||||
const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
|
||||
const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
|
||||
const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
|
||||
const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
|
||||
sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
|
||||
sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
|
||||
q8 += 8;
|
||||
aux32 >>= 7;
|
||||
}
|
||||
const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
|
||||
return d * sumi;
|
||||
#else
|
||||
assert(false);
|
||||
return 0.f;
|
||||
#endif
|
||||
#else
|
||||
assert(false);
|
||||
return 0.f;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
|
||||
@ -5357,27 +5511,37 @@ static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
|
||||
const half * xi = (const half *) cxi;
|
||||
float * dsti = (float *) cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
template <cpy_kernel_t cpy_1>
|
||||
static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
// determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
||||
// determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
||||
// then combine those indices with the corresponding byte offsets to get the total offsets
|
||||
const int i02 = i / (ne00*ne01);
|
||||
const int i01 = (i - i02*ne01*ne00) / ne00;
|
||||
const int i00 = i - i02*ne01*ne00 - i01*ne00;
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
|
||||
const int i03 = i/(ne00 * ne01 * ne02);
|
||||
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
|
||||
const int i12 = i / (ne10*ne11);
|
||||
const int i11 = (i - i12*ne10*ne11) / ne10;
|
||||
const int i10 = i - i12*ne10*ne11 - i11*ne10;
|
||||
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
|
||||
const int i13 = i/(ne10 * ne11 * ne12);
|
||||
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
|
||||
|
||||
cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
@ -5471,23 +5635,26 @@ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
|
||||
|
||||
template <cpy_kernel_t cpy_blck, int qk>
|
||||
static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13) {
|
||||
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i02 = i / (ne00*ne01);
|
||||
const int i01 = (i - i02*ne01*ne00) / ne00;
|
||||
const int i00 = (i - i02*ne01*ne00 - i01*ne00);
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
|
||||
const int i03 = i/(ne00 * ne01 * ne02);
|
||||
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
|
||||
const int i12 = i / (ne10*ne11);
|
||||
const int i11 = (i - i12*ne10*ne11) / ne10;
|
||||
const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
|
||||
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
|
||||
const int i13 = i/(ne10 * ne11 * ne12);
|
||||
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
||||
|
||||
cpy_blck(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
@ -6381,6 +6548,12 @@ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k,
|
||||
dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||
@ -6418,6 +6591,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
return dequantize_row_iq2_xxs_cuda;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
return dequantize_row_iq2_xs_cuda;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
return dequantize_row_iq3_xxs_cuda;
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
default:
|
||||
@ -6451,6 +6626,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||
return dequantize_row_iq2_xxs_cuda;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
return dequantize_row_iq2_xs_cuda;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
return dequantize_row_iq3_xxs_cuda;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half>;
|
||||
default:
|
||||
@ -6663,6 +6840,15 @@ static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, float
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void ggml_mul_mat_q4_0_q8_1_cuda(
|
||||
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
||||
@ -7135,69 +7321,82 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
|
||||
(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
|
||||
}
|
||||
|
||||
|
||||
static void ggml_cpy_f16_f32_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f32_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f16_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q8_0_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK8_0 == 0);
|
||||
const int num_blocks = ne / QK8_0;
|
||||
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_0_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_0 == 0);
|
||||
const int num_blocks = ne / QK4_0;
|
||||
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_1_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_1 == 0);
|
||||
const int num_blocks = ne / QK4_1;
|
||||
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f16_f16_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
|
||||
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
|
||||
@ -8213,6 +8412,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
@ -8235,6 +8435,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return 64;
|
||||
@ -8306,6 +8507,9 @@ static void ggml_cuda_op_mul_mat_vec_q(
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
@ -9941,19 +10145,25 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
GGML_ASSERT(src0->ne[3] == 1);
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
|
||||
//GGML_ASSERT(src0->ne[3] == 1);
|
||||
|
||||
const int64_t nb00 = src0->nb[0];
|
||||
const int64_t nb01 = src0->nb[1];
|
||||
const int64_t nb02 = src0->nb[2];
|
||||
const int64_t nb03 = src0->nb[3];
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
GGML_ASSERT(src1->ne[3] == 1);
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
|
||||
//GGML_ASSERT(src1->ne[3] == 1);
|
||||
|
||||
const int64_t nb10 = src1->nb[0];
|
||||
const int64_t nb11 = src1->nb[1];
|
||||
const int64_t nb12 = src1->nb[2];
|
||||
const int64_t nb13 = src1->nb[3];
|
||||
|
||||
ggml_cuda_set_device(g_main_device);
|
||||
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||
@ -9965,17 +10175,19 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
||||
char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
||||
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
||||
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
@ -10934,7 +11146,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
return false;
|
||||
}
|
||||
ggml_type a_type = a->type;
|
||||
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS) {
|
||||
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS) {
|
||||
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
||||
return false;
|
||||
}
|
||||
@ -10978,6 +11190,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_DUP:
|
||||
|
1990
ggml-kompute.cpp
Normal file
1990
ggml-kompute.cpp
Normal file
File diff suppressed because it is too large
Load Diff
46
ggml-kompute.h
Normal file
46
ggml-kompute.h
Normal file
@ -0,0 +1,46 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct ggml_vk_device {
|
||||
int index;
|
||||
int type; // same as VkPhysicalDeviceType
|
||||
size_t heapSize;
|
||||
const char * name;
|
||||
const char * vendor;
|
||||
int subgroupSize;
|
||||
uint64_t bufferAlignment;
|
||||
uint64_t maxAlloc;
|
||||
};
|
||||
|
||||
struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
|
||||
bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
|
||||
bool ggml_vk_has_vulkan(void);
|
||||
bool ggml_vk_has_device(void);
|
||||
struct ggml_vk_device ggml_vk_current_device(void);
|
||||
|
||||
//
|
||||
// backend API
|
||||
//
|
||||
|
||||
// forward declaration
|
||||
typedef struct ggml_backend * ggml_backend_t;
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
|
||||
|
||||
GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
@ -57,6 +57,9 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(voi
|
||||
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
||||
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
|
||||
|
||||
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
|
||||
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
85
ggml-metal.m
85
ggml-metal.m
@ -60,6 +60,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
|
||||
GGML_METAL_KERNEL_TYPE_RMS_NORM,
|
||||
GGML_METAL_KERNEL_TYPE_GROUP_NORM,
|
||||
@ -81,6 +82,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
|
||||
//GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,
|
||||
@ -98,6 +100,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,
|
||||
@ -112,6 +115,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
|
||||
@ -126,6 +130,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_F16,
|
||||
GGML_METAL_KERNEL_TYPE_ALIBI_F32,
|
||||
@ -169,6 +174,8 @@ struct ggml_metal_context {
|
||||
|
||||
bool support_simdgroup_reduction;
|
||||
bool support_simdgroup_mm;
|
||||
|
||||
bool should_capture_next_compute;
|
||||
};
|
||||
|
||||
// MSL code
|
||||
@ -355,6 +362,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false");
|
||||
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
|
||||
ctx->should_capture_next_compute = false;
|
||||
|
||||
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
||||
if (@available(macOS 10.12, iOS 16.0, *)) {
|
||||
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
|
||||
@ -431,6 +440,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K, get_rows_q6_K, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction);
|
||||
@ -452,6 +462,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction);
|
||||
@ -469,6 +480,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm);
|
||||
@ -483,6 +495,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm);
|
||||
@ -497,6 +510,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true);
|
||||
@ -693,6 +707,20 @@ static bool ggml_metal_graph_compute(
|
||||
const int n_cb = ctx->n_cb;
|
||||
const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
|
||||
|
||||
const bool should_capture = ctx->should_capture_next_compute;
|
||||
if (should_capture) {
|
||||
ctx->should_capture_next_compute = false;
|
||||
|
||||
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
|
||||
descriptor.captureObject = ctx->queue;
|
||||
|
||||
NSError * error = nil;
|
||||
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
|
||||
GGML_ASSERT(!"capture failed");
|
||||
}
|
||||
}
|
||||
|
||||
id<MTLCommandBuffer> command_buffer_builder[n_cb];
|
||||
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
|
||||
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
|
||||
@ -701,6 +729,7 @@ static bool ggml_metal_graph_compute(
|
||||
// enqueue the command buffers in order to specify their execution order
|
||||
[command_buffer enqueue];
|
||||
}
|
||||
|
||||
const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
|
||||
|
||||
dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) {
|
||||
@ -747,9 +776,9 @@ static bool ggml_metal_graph_compute(
|
||||
GGML_ASSERT(!"unsupported op");
|
||||
}
|
||||
|
||||
#ifndef GGML_METAL_NDEBUG
|
||||
if (should_capture) {
|
||||
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
|
||||
#endif
|
||||
}
|
||||
|
||||
const int64_t ne00 = src0 ? src0->ne[0] : 0;
|
||||
const int64_t ne01 = src0 ? src0->ne[1] : 0;
|
||||
@ -1276,6 +1305,7 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
|
||||
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
|
||||
}
|
||||
|
||||
@ -1404,6 +1434,12 @@ static bool ggml_metal_graph_compute(
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
|
||||
@ -1446,6 +1482,11 @@ static bool ggml_metal_graph_compute(
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_IQ3_XXS) {
|
||||
const int mem_size = 256*4+128;
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
@ -1540,6 +1581,7 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
|
||||
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
|
||||
}
|
||||
|
||||
@ -1671,6 +1713,12 @@ static bool ggml_metal_graph_compute(
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline;
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
|
||||
@ -1729,6 +1777,11 @@ static bool ggml_metal_graph_compute(
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_IQ3_XXS) {
|
||||
const int mem_size = 256*4+128;
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
@ -1769,6 +1822,7 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
|
||||
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
|
||||
default: GGML_ASSERT(false && "not implemented");
|
||||
}
|
||||
@ -2299,9 +2353,9 @@ static bool ggml_metal_graph_compute(
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef GGML_METAL_NDEBUG
|
||||
if (should_capture) {
|
||||
[encoder popDebugGroup];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
[encoder endEncoding];
|
||||
@ -2323,6 +2377,10 @@ static bool ggml_metal_graph_compute(
|
||||
}
|
||||
}
|
||||
|
||||
if (should_capture) {
|
||||
[[MTLCaptureManager sharedCaptureManager] stopCapture];
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -2498,6 +2556,16 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backen
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||
id<MTLDevice> device = ggml_backend_metal_get_device();
|
||||
size_t max_size = device.maxBufferLength;
|
||||
ggml_backend_metal_free_device();
|
||||
|
||||
return max_size;
|
||||
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
|
||||
|
||||
@ -2516,7 +2584,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
||||
/* .get_name = */ ggml_backend_metal_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
|
||||
/* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
|
||||
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
|
||||
@ -2690,6 +2758,13 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
||||
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
|
||||
}
|
||||
|
||||
void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
|
||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||
|
||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
||||
ctx->should_capture_next_compute = true;
|
||||
}
|
||||
|
||||
GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
|
||||
|
||||
GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
|
||||
|
274
ggml-metal.metal
274
ggml-metal.metal
@ -2856,6 +2856,12 @@ typedef struct {
|
||||
} block_iq2_xs;
|
||||
// 74 bytes / block for QK_K = 256, so 2.3125 bpw
|
||||
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[3*QK_K/8];
|
||||
} block_iq3_xxs;
|
||||
// 98 bytes / block for QK_K = 256, so 3.0625 bpw
|
||||
|
||||
//====================================== dot products =========================
|
||||
|
||||
void kernel_mul_mv_q2_K_f32_impl(
|
||||
@ -4078,6 +4084,42 @@ constexpr constant static uint64_t iq2xs_grid[512] = {
|
||||
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
||||
};
|
||||
|
||||
constexpr constant static uint32_t iq3xxs_grid[256] = {
|
||||
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
||||
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
||||
0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
|
||||
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
|
||||
0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
|
||||
0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
|
||||
0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
|
||||
0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
|
||||
0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
|
||||
0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
|
||||
0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
|
||||
0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
|
||||
0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
|
||||
0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
|
||||
0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
|
||||
0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
|
||||
0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
|
||||
0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
|
||||
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
|
||||
0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
|
||||
0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
|
||||
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
|
||||
0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
|
||||
0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
|
||||
0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
|
||||
0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
|
||||
0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
|
||||
0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
|
||||
0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
|
||||
0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
|
||||
0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
|
||||
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
||||
};
|
||||
|
||||
|
||||
constexpr constant static uint8_t ksigns_iq2xs[128] = {
|
||||
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
|
||||
144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
|
||||
@ -4367,6 +4409,143 @@ kernel void kernel_mul_mv_iq2_xs_f32(
|
||||
kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
void kernel_mul_mv_iq3_xxs_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
const int nb = ne00/QK_K;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
|
||||
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
||||
const int ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
device const block_iq3_xxs * x = (device const block_iq3_xxs *) src0 + ib_row + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float yl[32];
|
||||
float sumf[N_DST]={0.f}, all_sum;
|
||||
|
||||
const int nb32 = nb * (QK_K / 32);
|
||||
|
||||
threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values;
|
||||
threadgroup uint8_t * shared_signs = (threadgroup uint8_t *)(values + 256);
|
||||
{
|
||||
int nval = 4;
|
||||
int pos = (32*sgitg + tiisg)*nval;
|
||||
for (int i = 0; i < nval; ++i) values[pos + i] = iq3xxs_grid[pos + i];
|
||||
nval = 2;
|
||||
pos = (32*sgitg + tiisg)*nval;
|
||||
for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
const int ix = tiisg;
|
||||
|
||||
device const float * y4 = y + 32 * ix;
|
||||
|
||||
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
|
||||
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
yl[i] = y4[i];
|
||||
}
|
||||
|
||||
const int ibl = ib32 / (QK_K / 32);
|
||||
const int ib = ib32 % (QK_K / 32);
|
||||
|
||||
device const block_iq3_xxs * xr = x + ibl;
|
||||
device const uint8_t * q3 = xr->qs + 8 * ib;
|
||||
device const uint16_t * gas = (device const uint16_t *)(xr->qs + QK_K/4) + 2 * ib;
|
||||
device const half * dh = &xr->d;
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
|
||||
const float db = dh[0];
|
||||
const uint32_t aux32 = gas[0] | (gas[1] << 16);
|
||||
const float d = db * (0.5f + (aux32 >> 28));
|
||||
|
||||
float2 sum = {0};
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + q3[2*l+0]);
|
||||
const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + q3[2*l+1]);
|
||||
const uint8_t signs = shared_signs[(aux32 >> 7*l) & 127];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sum[0] += yl[8*l + j + 0] * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
sum[1] += yl[8*l + j + 4] * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
sumf[row] += d * (sum[0] + sum[1]);
|
||||
|
||||
dh += nb*sizeof(block_iq3_xxs)/2;
|
||||
q3 += nb*sizeof(block_iq3_xxs);
|
||||
gas += nb*sizeof(block_iq3_xxs)/2;
|
||||
}
|
||||
|
||||
y4 += 32 * 32;
|
||||
}
|
||||
#else
|
||||
// TODO
|
||||
#endif
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_iq3_xxs_f32")]]
|
||||
kernel void kernel_mul_mv_iq3_xxs_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
|
||||
//============================= templates and their specializations =============================
|
||||
|
||||
// NOTE: this is not dequantizing - we are simply fitting the template
|
||||
@ -4684,6 +4863,33 @@ void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint8_t * q3 = xb->qs + 8*ib32;
|
||||
device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32;
|
||||
const uint32_t aux32 = gas[0] | (gas[1] << 16);
|
||||
const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f;
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]);
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]);
|
||||
uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
|
||||
reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
|
||||
}
|
||||
grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]);
|
||||
grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]);
|
||||
signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
|
||||
reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
||||
kernel void kernel_get_rows(
|
||||
device const void * src0,
|
||||
@ -5225,6 +5431,7 @@ template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows
|
||||
template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
|
||||
//
|
||||
// matrix-matrix multiplication
|
||||
@ -5263,6 +5470,7 @@ template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm<b
|
||||
template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
|
||||
//
|
||||
// indirect matrix-matrix multiplication
|
||||
@ -5313,6 +5521,7 @@ template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mu
|
||||
template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
|
||||
//
|
||||
// matrix-vector multiplication
|
||||
@ -6215,3 +6424,68 @@ kernel void kernel_mul_mv_id_iq2_xs_f32(
|
||||
tiisg,
|
||||
sgitg);
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_id_iq3_xxs_f32")]]
|
||||
kernel void kernel_mul_mv_id_iq3_xxs_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
device const char * src00,
|
||||
device const char * src01,
|
||||
device const char * src02,
|
||||
device const char * src03,
|
||||
device const char * src04,
|
||||
device const char * src05,
|
||||
device const char * src06,
|
||||
device const char * src07,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
|
||||
|
||||
const int64_t bid = tgpig.z/(ne12*ne13);
|
||||
|
||||
tgpig.z = tgpig.z%(ne12*ne13);
|
||||
|
||||
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
|
||||
|
||||
kernel_mul_mv_iq3_xxs_f32_impl(
|
||||
src0[id],
|
||||
(device const float *) (src1 + bid*nb11),
|
||||
dst + bid*ne0,
|
||||
ne00,
|
||||
ne01,
|
||||
ne02,
|
||||
ne10,
|
||||
ne12,
|
||||
ne0,
|
||||
ne1,
|
||||
r2,
|
||||
r3,
|
||||
shared_values,
|
||||
tgpig,
|
||||
tiisg,
|
||||
sgitg);
|
||||
}
|
||||
|
@ -2125,6 +2125,15 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
|
||||
GGML_UNUSED(buffer_type);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
|
||||
static size_t max_size = -1;
|
||||
if (max_size == (size_t)-1) {
|
||||
ggml_cl_init();
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_size, NULL);
|
||||
}
|
||||
return max_size;
|
||||
}
|
||||
|
||||
static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
|
||||
//return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
|
||||
return ggml_backend_is_cpu(backend);
|
||||
@ -2136,7 +2145,7 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
|
||||
/* .get_name = */ ggml_backend_opencl_buffer_type_name,
|
||||
/* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL, // TODO: return from device info
|
||||
/* .get_max_size = */ ggml_backend_opencl_buffer_type_get_max_size,
|
||||
/* .get_alloc_size = */ NULL,
|
||||
/* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
|
||||
/* .is_host = */ NULL,
|
||||
|
721
ggml-quants.c
721
ggml-quants.c
@ -3441,6 +3441,41 @@ static const uint64_t iq2xs_grid[512] = {
|
||||
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
||||
};
|
||||
|
||||
static const uint32_t iq3xxs_grid[256] = {
|
||||
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
||||
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
||||
0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
|
||||
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
|
||||
0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
|
||||
0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
|
||||
0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
|
||||
0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
|
||||
0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
|
||||
0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
|
||||
0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
|
||||
0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
|
||||
0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
|
||||
0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
|
||||
0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
|
||||
0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
|
||||
0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
|
||||
0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
|
||||
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
|
||||
0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
|
||||
0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
|
||||
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
|
||||
0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
|
||||
0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
|
||||
0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
|
||||
0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
|
||||
0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
|
||||
0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
|
||||
0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
|
||||
0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
|
||||
0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
|
||||
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
||||
};
|
||||
|
||||
static const uint8_t ksigns_iq2xs[128] = {
|
||||
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
|
||||
144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
|
||||
@ -3507,6 +3542,38 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
||||
}
|
||||
}
|
||||
|
||||
// ====================== 3.0625 bpw (de)-quantization
|
||||
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
uint32_t aux32;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d);
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint8_t * scales_and_signs = qs + QK_K/4;
|
||||
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
memcpy(&aux32, scales_and_signs + 4*ib32, sizeof(uint32_t));
|
||||
const float db = d * (0.5f + (aux32 >> 28)) * 0.5f;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + qs[2*l+0]);
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + qs[2*l+1]);
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+0] = db * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = db * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
y += 8;
|
||||
}
|
||||
qs += 8;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//===================================== Q8_K ==============================================
|
||||
|
||||
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
|
||||
@ -8458,17 +8525,36 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
||||
|
||||
const __m128i m4 = _mm_set1_epi8(0xf);
|
||||
const __m128i m1 = _mm_set1_epi8(1);
|
||||
const __m128i m511 = _mm_set1_epi16(511);
|
||||
const __m128i m127 = _mm_set1_epi16(127);
|
||||
const __m256i m511 = _mm256_set1_epi16(511);
|
||||
const __m256i mone = _mm256_set1_epi8(1);
|
||||
|
||||
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
||||
static const uint8_t k_bit_helper[32] = {
|
||||
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
||||
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
||||
};
|
||||
static const char block_sign_shuffle_mask_1[32] = {
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
||||
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
|
||||
};
|
||||
static const char block_sign_shuffle_mask_2[32] = {
|
||||
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
|
||||
0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
|
||||
};
|
||||
static const uint8_t bit_selector_mask_bytes[32] = {
|
||||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||
};
|
||||
|
||||
const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
|
||||
const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
|
||||
const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
|
||||
const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
|
||||
|
||||
uint64_t aux64;
|
||||
|
||||
// somewhat hacky, but gives a significant boost in performance
|
||||
__m128i aux_gindex, aux_sindex;
|
||||
__m256i aux_gindex;
|
||||
const uint16_t * gindex = (const uint16_t *)&aux_gindex;
|
||||
const uint16_t * sindex = (const uint16_t *)&aux_sindex;
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
@ -8483,26 +8569,68 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
||||
|
||||
__m256i sumi1 = _mm256_setzero_si256();
|
||||
__m256i sumi2 = _mm256_setzero_si256();
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
|
||||
|
||||
const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2); q2 += 16;
|
||||
aux_gindex = _mm256_and_si256(q2_data, m511);
|
||||
|
||||
const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9);
|
||||
const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13);
|
||||
const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper);
|
||||
|
||||
const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
|
||||
const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits);
|
||||
|
||||
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m128i q2_data = _mm_loadu_si128((const __m128i*)q2); q2 += 8;
|
||||
aux_gindex = _mm_and_si128(q2_data, m511);
|
||||
aux_sindex = _mm_and_si128(_mm_srli_epi16(q2_data, 9), m127);
|
||||
const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]], iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
|
||||
const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]], iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
|
||||
const __m256i s2_1 = _mm256_set_epi64x(signs64[sindex[3]], signs64[sindex[2]], signs64[sindex[1]], signs64[sindex[0]]);
|
||||
const __m256i s2_2 = _mm256_set_epi64x(signs64[sindex[7]], signs64[sindex[6]], signs64[sindex[5]], signs64[sindex[4]]);
|
||||
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
|
||||
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
|
||||
const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
|
||||
const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
|
||||
iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
|
||||
const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
|
||||
iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
|
||||
const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
|
||||
iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
|
||||
const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
|
||||
iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
|
||||
|
||||
const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
|
||||
const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
|
||||
const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l);
|
||||
const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h);
|
||||
|
||||
__m256i signs;
|
||||
signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
|
||||
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
||||
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
|
||||
|
||||
signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2);
|
||||
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
||||
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
|
||||
|
||||
signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1);
|
||||
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
||||
const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone));
|
||||
|
||||
signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2);
|
||||
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
||||
const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone));
|
||||
|
||||
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
||||
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
||||
const __m256i dot3 = _mm256_maddubs_epi16(q2_3, q8s_3);
|
||||
const __m256i dot4 = _mm256_maddubs_epi16(q2_4, q8s_4);
|
||||
|
||||
const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)));
|
||||
const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
|
||||
const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)));
|
||||
const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)));
|
||||
|
||||
sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1));
|
||||
sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
|
||||
sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3));
|
||||
sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4));
|
||||
}
|
||||
|
||||
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
|
||||
@ -8551,6 +8679,136 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
||||
#endif
|
||||
}
|
||||
|
||||
// TODO
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
assert(n % QK_K == 0);
|
||||
|
||||
const block_iq3_xxs * restrict x = vx;
|
||||
const block_q8_K * restrict y = vy;
|
||||
|
||||
const int nb = n / QK_K;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
|
||||
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
||||
|
||||
uint32_t aux32[2];
|
||||
|
||||
ggml_int8x16x4_t q3s;
|
||||
ggml_int8x16x4_t q8b;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * restrict q3 = x[i].qs;
|
||||
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
float sumf1 = 0, sumf2 = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
||||
memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
|
||||
const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
|
||||
const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
|
||||
const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
|
||||
const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
|
||||
q3 += 16;
|
||||
q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
|
||||
q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
|
||||
q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127))));
|
||||
q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
|
||||
q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
|
||||
q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
|
||||
q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
|
||||
q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
|
||||
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
|
||||
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
|
||||
sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
|
||||
sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
|
||||
}
|
||||
sumf += d*(sumf1 + sumf2);
|
||||
}
|
||||
*s = 0.5f * sumf;
|
||||
|
||||
#elif defined(__AVX2__)
|
||||
|
||||
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
||||
|
||||
uint32_t aux32[2];
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * restrict q3 = x[i].qs;
|
||||
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
__m256i sumi1 = _mm256_setzero_si256();
|
||||
__m256i sumi2 = _mm256_setzero_si256();
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
|
||||
iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
||||
q3 += 8;
|
||||
const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
|
||||
iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
||||
q3 += 8;
|
||||
memcpy(aux32, gas, 8); gas += 8;
|
||||
const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
|
||||
signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
|
||||
const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
|
||||
signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
|
||||
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
|
||||
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
|
||||
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
||||
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
||||
const uint16_t ls1 = aux32[0] >> 28;
|
||||
const uint16_t ls2 = aux32[1] >> 28;
|
||||
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
|
||||
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
|
||||
sumi1 = _mm256_add_epi32(sumi1, p1);
|
||||
sumi2 = _mm256_add_epi32(sumi2, p2);
|
||||
}
|
||||
|
||||
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
|
||||
|
||||
}
|
||||
|
||||
*s = 0.25f * hsum_float_8(accumf);
|
||||
|
||||
#else
|
||||
|
||||
uint32_t aux32;
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * restrict q3 = x[i].qs;
|
||||
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
|
||||
const uint32_t ls = 2*(aux32 >> 28) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
q3 += 8;
|
||||
bsum += sumi * ls;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = 0.25f * sumf;
|
||||
#endif
|
||||
}
|
||||
|
||||
// ================================ IQ2 quantization =============================================
|
||||
|
||||
typedef struct {
|
||||
@ -9189,3 +9447,436 @@ size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, i
|
||||
return nrow * nblock * sizeof(block_iq2_xs);
|
||||
}
|
||||
|
||||
//
|
||||
// ============================================= 3-bit using D4 lattice
|
||||
//
|
||||
|
||||
typedef struct {
|
||||
uint32_t * grid;
|
||||
int * map;
|
||||
uint16_t * neighbours;
|
||||
} iq3_entry_t;
|
||||
|
||||
static iq3_entry_t iq3_data[1] = {
|
||||
{NULL, NULL, NULL},
|
||||
};
|
||||
|
||||
static inline int iq3_data_index(int grid_size) {
|
||||
(void)grid_size;
|
||||
GGML_ASSERT(grid_size == 256);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int iq3_compare_func(const void * left, const void * right) {
|
||||
const int * l = (const int *)left;
|
||||
const int * r = (const int *)right;
|
||||
return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
|
||||
}
|
||||
|
||||
void iq3xs_init_impl(int grid_size) {
|
||||
const int gindex = iq3_data_index(grid_size);
|
||||
if (iq3_data[gindex].grid) {
|
||||
return;
|
||||
}
|
||||
static const uint16_t kgrid_256[256] = {
|
||||
0, 2, 4, 9, 11, 15, 16, 18, 25, 34, 59, 61, 65, 67, 72, 74,
|
||||
81, 85, 88, 90, 97, 108, 120, 128, 130, 132, 137, 144, 146, 153, 155, 159,
|
||||
169, 175, 189, 193, 199, 200, 202, 213, 248, 267, 287, 292, 303, 315, 317, 321,
|
||||
327, 346, 362, 413, 436, 456, 460, 462, 483, 497, 513, 515, 520, 522, 529, 531,
|
||||
536, 538, 540, 551, 552, 576, 578, 585, 592, 594, 641, 643, 648, 650, 657, 664,
|
||||
698, 704, 706, 720, 729, 742, 758, 769, 773, 808, 848, 852, 870, 889, 901, 978,
|
||||
992, 1024, 1026, 1033, 1035, 1040, 1042, 1046, 1049, 1058, 1089, 1091, 1093, 1096, 1098, 1105,
|
||||
1112, 1139, 1143, 1144, 1152, 1154, 1161, 1167, 1168, 1170, 1183, 1184, 1197, 1217, 1224, 1228,
|
||||
1272, 1276, 1309, 1323, 1347, 1367, 1377, 1404, 1473, 1475, 1486, 1509, 1537, 1544, 1546, 1553,
|
||||
1555, 1576, 1589, 1594, 1600, 1602, 1616, 1625, 1636, 1638, 1665, 1667, 1672, 1685, 1706, 1722,
|
||||
1737, 1755, 1816, 1831, 1850, 1856, 1862, 1874, 1901, 1932, 1950, 1971, 2011, 2032, 2052, 2063,
|
||||
2077, 2079, 2091, 2095, 2172, 2192, 2207, 2208, 2224, 2230, 2247, 2277, 2308, 2345, 2356, 2389,
|
||||
2403, 2424, 2501, 2504, 2506, 2520, 2570, 2593, 2616, 2624, 2630, 2646, 2669, 2700, 2714, 2746,
|
||||
2754, 2795, 2824, 2835, 2839, 2874, 2882, 2905, 2984, 3028, 3042, 3092, 3108, 3110, 3124, 3153,
|
||||
3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610,
|
||||
3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992,
|
||||
};
|
||||
const int kmap_size = 4096;
|
||||
const int nwant = 2;
|
||||
const uint16_t * kgrid = kgrid_256;
|
||||
uint32_t * kgrid_q3xs;
|
||||
int * kmap_q3xs;
|
||||
uint16_t * kneighbors_q3xs;
|
||||
|
||||
printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
||||
uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
|
||||
for (int k = 0; k < grid_size; ++k) {
|
||||
int8_t * pos = (int8_t *)(the_grid + k);
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
int l = (kgrid[k] >> 3*i) & 0x7;
|
||||
pos[i] = 2*l + 1;
|
||||
}
|
||||
}
|
||||
kgrid_q3xs = the_grid;
|
||||
iq3_data[gindex].grid = the_grid;
|
||||
kmap_q3xs = (int *)malloc(kmap_size*sizeof(int));
|
||||
iq3_data[gindex].map = kmap_q3xs;
|
||||
for (int i = 0; i < kmap_size; ++i) kmap_q3xs[i] = -1;
|
||||
uint32_t aux32;
|
||||
uint8_t * aux8 = (uint8_t *)&aux32;
|
||||
for (int i = 0; i < grid_size; ++i) {
|
||||
aux32 = kgrid_q3xs[i];
|
||||
uint16_t index = 0;
|
||||
for (int k=0; k<4; ++k) {
|
||||
uint16_t q = (aux8[k] - 1)/2;
|
||||
index |= (q << 3*k);
|
||||
}
|
||||
kmap_q3xs[index] = i;
|
||||
}
|
||||
int8_t pos[4];
|
||||
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
|
||||
int num_neighbors = 0, num_not_in_map = 0;
|
||||
for (int i = 0; i < kmap_size; ++i) {
|
||||
if (kmap_q3xs[i] >= 0) continue;
|
||||
++num_not_in_map;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
int l = (i >> 3*k) & 0x7;
|
||||
pos[k] = 2*l + 1;
|
||||
}
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
|
||||
int d2 = 0;
|
||||
for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
||||
dist2[2*j+0] = d2;
|
||||
dist2[2*j+1] = j;
|
||||
}
|
||||
qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
|
||||
int n = 0; int d2 = dist2[0];
|
||||
int nhave = 1;
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
if (dist2[2*j] > d2) {
|
||||
if (nhave == nwant) break;
|
||||
d2 = dist2[2*j];
|
||||
++nhave;
|
||||
}
|
||||
++n;
|
||||
}
|
||||
num_neighbors += n;
|
||||
}
|
||||
printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
||||
kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
||||
iq3_data[gindex].neighbours = kneighbors_q3xs;
|
||||
int counter = 0;
|
||||
for (int i = 0; i < kmap_size; ++i) {
|
||||
if (kmap_q3xs[i] >= 0) continue;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
int l = (i >> 3*k) & 0x7;
|
||||
pos[k] = 2*l + 1;
|
||||
}
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
|
||||
int d2 = 0;
|
||||
for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
||||
dist2[2*j+0] = d2;
|
||||
dist2[2*j+1] = j;
|
||||
}
|
||||
qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
|
||||
kmap_q3xs[i] = -(counter + 1);
|
||||
int d2 = dist2[0];
|
||||
uint16_t * start = &kneighbors_q3xs[counter++];
|
||||
int n = 0, nhave = 1;
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
if (dist2[2*j] > d2) {
|
||||
if (nhave == nwant) break;
|
||||
d2 = dist2[2*j];
|
||||
++nhave;
|
||||
}
|
||||
kneighbors_q3xs[counter++] = dist2[2*j+1];
|
||||
++n;
|
||||
}
|
||||
*start = n;
|
||||
}
|
||||
free(dist2);
|
||||
}
|
||||
|
||||
void iq3xs_free_impl(int grid_size) {
|
||||
GGML_ASSERT(grid_size == 256);
|
||||
const int gindex = iq3_data_index(grid_size);
|
||||
if (iq3_data[gindex].grid) {
|
||||
free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL;
|
||||
free(iq3_data[gindex].map); iq3_data[gindex].map = NULL;
|
||||
free(iq3_data[gindex].neighbours); iq3_data[gindex].neighbours = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid,
|
||||
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
|
||||
int num_neighbors = neighbours[0];
|
||||
GGML_ASSERT(num_neighbors > 0);
|
||||
float best_d2 = FLT_MAX;
|
||||
int grid_index = -1;
|
||||
for (int j = 1; j <= num_neighbors; ++j) {
|
||||
const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
|
||||
float d2 = 0;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
float q = pg[i];
|
||||
float diff = scale*q - xval[i];
|
||||
d2 += weight[i]*diff*diff;
|
||||
}
|
||||
if (d2 < best_d2) {
|
||||
best_d2 = d2; grid_index = neighbours[j];
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(grid_index >= 0);
|
||||
const int8_t * pg = (const int8_t *)(grid + grid_index);
|
||||
for (int i = 0; i < 4; ++i) L[i] = (pg[i] - 1)/2;
|
||||
return grid_index;
|
||||
}
|
||||
|
||||
static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
||||
|
||||
const int gindex = iq3_data_index(256);
|
||||
|
||||
const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
|
||||
const int * kmap_q3xs = iq3_data[gindex].map;
|
||||
const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
|
||||
|
||||
//GGML_ASSERT(quant_weights && "missing quantization weights");
|
||||
GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
|
||||
const int kMaxQ = 8;
|
||||
|
||||
const int nbl = n/256;
|
||||
|
||||
block_iq3_xxs * y = vy;
|
||||
|
||||
float scales[QK_K/32];
|
||||
float weight[32];
|
||||
float xval[32];
|
||||
int8_t L[32];
|
||||
int8_t Laux[32];
|
||||
float waux[32];
|
||||
bool is_on_grid[8];
|
||||
bool is_on_grid_aux[8];
|
||||
uint8_t block_signs[8];
|
||||
uint8_t q3[3*(QK_K/8)];
|
||||
uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4);
|
||||
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) {
|
||||
|
||||
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
||||
memset(q3, 0, 3*QK_K/8);
|
||||
|
||||
float max_scale = 0;
|
||||
|
||||
const float * xbl = x + QK_K*ibl;
|
||||
float sumx2 = 0;
|
||||
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
||||
float sigma2 = sumx2/QK_K;
|
||||
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
const float * xb = xbl + 32*ib;
|
||||
if (quant_weights) {
|
||||
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
||||
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
||||
} else {
|
||||
for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
|
||||
}
|
||||
for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
int nflip = 0;
|
||||
uint8_t s = 0;
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
||||
else {
|
||||
xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
|
||||
}
|
||||
}
|
||||
if (nflip%2) {
|
||||
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
||||
for (int i = 1; i < 8; ++i) {
|
||||
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
||||
if (ax < min) {
|
||||
min = ax; imin = i;
|
||||
}
|
||||
}
|
||||
xval[8*k+imin] = -xval[8*k+imin];
|
||||
s ^= (1 << imin);
|
||||
}
|
||||
block_signs[k] = s & 127;
|
||||
}
|
||||
float max = xval[0];
|
||||
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
||||
if (!max) {
|
||||
scales[ib] = 0;
|
||||
memset(L, 0, 32);
|
||||
continue;
|
||||
}
|
||||
float best = 0;
|
||||
float scale = max/(2*kMaxQ-1);
|
||||
for (int is = -15; is <= 15; ++is) {
|
||||
float id = (2*kMaxQ-1+is*0.2f)/max;
|
||||
float this_scale = 1/id;
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
||||
Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
||||
}
|
||||
uint16_t u = 0;
|
||||
for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
|
||||
int grid_index = kmap_q3xs[u];
|
||||
is_on_grid_aux[k] = true;
|
||||
if (grid_index < 0) {
|
||||
is_on_grid_aux[k] = false;
|
||||
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
||||
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
|
||||
}
|
||||
}
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
float w = weight[i];
|
||||
float q = 2*Laux[i] + 1;
|
||||
sumqx += w*xval[i]*q;
|
||||
sumq2 += w*q*q;
|
||||
}
|
||||
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
||||
scale = sumqx/sumq2; best = scale*sumqx;
|
||||
for (int i = 0; i < 32; ++i) L[i] = Laux[i];
|
||||
for (int k = 0; k < 8; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
||||
}
|
||||
}
|
||||
int n_not_ongrid = 0;
|
||||
for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
||||
if (n_not_ongrid > 0 && scale > 0) {
|
||||
float id = 1/scale;
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
if (is_on_grid[k]) continue;
|
||||
uint16_t u = 0;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
||||
l = MAX(0, MIN(kMaxQ-1, l));
|
||||
u |= (l << 3*i);
|
||||
}
|
||||
int grid_index = kmap_q3xs[u];
|
||||
if (grid_index < 0) {
|
||||
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
||||
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
|
||||
}
|
||||
const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
|
||||
for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
|
||||
}
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
float w = weight[i];
|
||||
float q = 2*L[i] + 1;
|
||||
sumqx += w*xval[i]*q;
|
||||
sumq2 += w*q*q;
|
||||
}
|
||||
if (sumq2 > 0) scale = sumqx/sumq2;
|
||||
}
|
||||
if (scale < 0) {
|
||||
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
|
||||
// and correspondingly flip quant signs.
|
||||
scale = -scale;
|
||||
for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
|
||||
}
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
uint16_t u = 0;
|
||||
for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
|
||||
int grid_index = kmap_q3xs[u];
|
||||
if (grid_index < 0) {
|
||||
printf("Oops: found point %u not on grid:", u);
|
||||
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
|
||||
printf("\n");
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
q3[8*ib+k] = grid_index;
|
||||
}
|
||||
scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
|
||||
GGML_ASSERT(scale >= 0);
|
||||
scales[ib] = scale;
|
||||
max_scale = MAX(max_scale, scale);
|
||||
}
|
||||
|
||||
if (!max_scale) {
|
||||
memset(y[ibl].qs, 0, 3*QK_K/8);
|
||||
continue;
|
||||
}
|
||||
|
||||
float d = max_scale/31;
|
||||
y[ibl].d = GGML_FP32_TO_FP16(d);
|
||||
float id = 1/d;
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
||||
l = MAX(0, MIN(15, l));
|
||||
scales_and_signs[ib] |= ((uint32_t)l << 28);
|
||||
if (false) {
|
||||
const float * xb = xbl + 32*ib;
|
||||
if (quant_weights) {
|
||||
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
||||
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
||||
} else {
|
||||
for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
|
||||
}
|
||||
const float db = 0.25f * d * (1 + 2*l);
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
const int8_t * signs = keven_signs_q2xs + 8*((scales_and_signs[ib] >> 7*(k/2)) & 127) + 4*(k%2);
|
||||
const float * xk = xb + 4*k;
|
||||
const float * wk = weight + 4*k;
|
||||
//const uint8_t * grid = (const uint8_t *)(kgrid_q3xs + q3[8*ib+k]);
|
||||
const uint8_t * grid = (const uint8_t *)(iq3xxs_grid + q3[8*ib+k]);
|
||||
float best_mse = 0; int best_index = q3[8*ib+k];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
float diff = db * grid[j] * signs[j] - xk[j];
|
||||
best_mse += wk[j] * diff * diff;
|
||||
}
|
||||
for (int idx = 0; idx < 256; ++idx) {
|
||||
//grid = (const uint8_t *)(kgrid_q3xs + idx);
|
||||
grid = (const uint8_t *)(iq3xxs_grid + idx);
|
||||
float mse = 0;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
float diff = db * grid[j] * signs[j] - xk[j];
|
||||
mse += wk[j] * diff * diff;
|
||||
}
|
||||
if (mse < best_mse) {
|
||||
best_mse = mse; best_index = idx;
|
||||
}
|
||||
}
|
||||
q3[8*ib+k] = best_index;
|
||||
//grid = (const uint8_t *)(kgrid_q3xs + best_index);
|
||||
grid = (const uint8_t *)(iq3xxs_grid + best_index);
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
float q = db * grid[j] * signs[j];
|
||||
sumqx += wk[j] * q * xk[j];
|
||||
sumq2 += wk[j] * q * q;
|
||||
}
|
||||
}
|
||||
if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
|
||||
}
|
||||
}
|
||||
memcpy(y[ibl].qs, q3, 3*QK_K/8);
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
int nblock = n_per_row/QK_K;
|
||||
char * qrow = (char *)dst;
|
||||
for (int row = 0; row < nrow; ++row) {
|
||||
quantize_row_iq3_xxs_impl(src, qrow, n_per_row, quant_weights);
|
||||
src += n_per_row;
|
||||
qrow += nblock*sizeof(block_iq3_xxs);
|
||||
}
|
||||
return nrow * nblock * sizeof(block_iq3_xxs);
|
||||
}
|
||||
|
||||
void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_iq3_xxs * restrict y = vy;
|
||||
quantize_row_iq3_xxs_reference(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_row_iq3_xxs_impl(x, y, k, NULL);
|
||||
}
|
||||
|
@ -166,7 +166,7 @@ typedef struct {
|
||||
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
|
||||
|
||||
// (Almost) "true" 2-bit quantization.
|
||||
// Due to the need to use blocks as per ggml dsign, it ends up using
|
||||
// Due to the need to use blocks as per ggml design, it ends up using
|
||||
// 2.0625 bpw because of the 16-bit scale for each block of 256.
|
||||
typedef struct {
|
||||
ggml_fp16_t d;
|
||||
@ -182,6 +182,15 @@ typedef struct {
|
||||
} block_iq2_xs;
|
||||
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
||||
|
||||
// (Almost) "true" 3-bit quantization.
|
||||
// Due to the need to use blocks as per ggml design, it ends up using
|
||||
// 3.0625 bpw because of the 16-bit scale for each block of 256.
|
||||
typedef struct {
|
||||
ggml_fp16_t d;
|
||||
uint8_t qs[3*QK_K/8];
|
||||
} block_iq3_xxs;
|
||||
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
||||
|
||||
// Quantization
|
||||
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
|
||||
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
|
||||
@ -196,6 +205,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
|
||||
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
||||
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
||||
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
||||
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k);
|
||||
|
||||
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
|
||||
@ -210,6 +220,7 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_iq3_xxs(const float * restrict x, void * restrict y, int k);
|
||||
|
||||
// Dequantization
|
||||
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
|
||||
@ -227,6 +238,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
|
||||
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
@ -242,12 +254,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx,
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
|
||||
//
|
||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||
//
|
||||
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
@ -260,3 +274,5 @@ size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row,
|
||||
|
||||
void iq2xs_init_impl(int grid_size);
|
||||
void iq2xs_free_impl(int grid_size);
|
||||
void iq3xs_init_impl(int grid_size);
|
||||
void iq3xs_free_impl(int grid_size);
|
||||
|
417
ggml-vulkan.cpp
417
ggml-vulkan.cpp
@ -116,7 +116,7 @@ struct vk_device {
|
||||
vk_queue transfer_queue;
|
||||
uint32_t descriptor_set_mode;
|
||||
uint32_t subgroup_size;
|
||||
bool is_igpu;
|
||||
bool uma;
|
||||
};
|
||||
|
||||
struct vk_op_push_constants {
|
||||
@ -675,7 +675,7 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_
|
||||
|
||||
vk::PhysicalDeviceMemoryProperties mem_props = vk_device.physical_device.getMemoryProperties();
|
||||
|
||||
uint32_t memory_type_index = uint32_t(~0);
|
||||
uint32_t memory_type_index = UINT32_MAX;
|
||||
|
||||
for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
|
||||
vk::MemoryType memory_type = mem_props.memoryTypes[i];
|
||||
@ -685,7 +685,18 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_
|
||||
}
|
||||
}
|
||||
|
||||
if (memory_type_index >= mem_props.memoryTypeCount) {
|
||||
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
|
||||
}
|
||||
|
||||
try {
|
||||
buf.device_memory = vk_device.device.allocateMemory({ mem_req.size, memory_type_index });
|
||||
} catch (const vk::SystemError& e) {
|
||||
// Out of Host/Device memory, clean up buffer
|
||||
vk_device.device.destroyBuffer(buf.buffer);
|
||||
buf.size = 0;
|
||||
throw e;
|
||||
}
|
||||
buf.memory_property_flags = req_flags;
|
||||
buf.ptr = nullptr;
|
||||
|
||||
@ -700,6 +711,47 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_
|
||||
return buf;
|
||||
}
|
||||
|
||||
static vk_buffer ggml_vk_create_buffer_check(size_t size, vk::MemoryPropertyFlags req_flags) {
|
||||
try {
|
||||
return ggml_vk_create_buffer(size, req_flags);
|
||||
} catch (const vk::SystemError& e) {
|
||||
std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
|
||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
static vk_buffer ggml_vk_create_buffer_device(size_t size) {
|
||||
vk_buffer buf;
|
||||
try {
|
||||
buf = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
} catch (const vk::SystemError& e) {
|
||||
if (vk_device.uma) {
|
||||
// Fall back to host memory type
|
||||
buf = ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
} else {
|
||||
std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
|
||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
static void ggml_vk_destroy_buffer(vk_buffer& buf) {
|
||||
if (buf.size == 0) {
|
||||
return;
|
||||
}
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_destroy_buffer(" << buf.size << ")" << std::endl;
|
||||
#endif
|
||||
|
||||
buf.size = 0;
|
||||
vk_device.device.freeMemory(buf.device_memory);
|
||||
vk_device.device.destroyBuffer(buf.buffer);
|
||||
}
|
||||
|
||||
static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
|
||||
return { buf, 0, VK_WHOLE_SIZE };
|
||||
}
|
||||
@ -738,19 +790,6 @@ static void ggml_vk_wait_events(vk::CommandBuffer& cmd_buffer, std::vector<vk::E
|
||||
);
|
||||
}
|
||||
|
||||
static void ggml_vk_destroy_buffer(vk_buffer& buf) {
|
||||
if (buf.size == 0) {
|
||||
return;
|
||||
}
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_destroy_buffer(" << buf.size << ")" << std::endl;
|
||||
#endif
|
||||
|
||||
buf.size = 0;
|
||||
vk_device.device.freeMemory(buf.device_memory);
|
||||
vk_device.device.destroyBuffer(buf.buffer);
|
||||
}
|
||||
|
||||
static bool ggml_vk_build_shader(ggml_type type) {
|
||||
switch(type) {
|
||||
case GGML_TYPE_F16:
|
||||
@ -1015,7 +1054,7 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
|
||||
|
||||
vk_device.vendor_id = vk_device.properties.vendorID;
|
||||
vk_device.subgroup_size = subgroup_props.subgroupSize;
|
||||
vk_device.is_igpu = vk_device.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
vk_device.uma = vk_device.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
|
||||
bool fp16_storage = false;
|
||||
bool fp16_compute = false;
|
||||
@ -1088,7 +1127,7 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
|
||||
if (vk_device.fp16) {
|
||||
device_extensions.push_back("VK_KHR_shader_float16_int8");
|
||||
}
|
||||
std::cerr << "ggml_vulkan: Using " << vk_device.properties.deviceName << " | fp16: " << vk_device.fp16 << " | warp size: " << vk_device.subgroup_size << std::endl;
|
||||
std::cerr << "ggml_vulkan: Using " << vk_device.properties.deviceName << " | uma: " << vk_device.uma << " | fp16: " << vk_device.fp16 << " | warp size: " << vk_device.subgroup_size << std::endl;
|
||||
device_create_info = {
|
||||
vk::DeviceCreateFlags(),
|
||||
device_queue_create_infos,
|
||||
@ -1210,7 +1249,7 @@ static vk_buffer ggml_vk_pool_malloc(size_t size) {
|
||||
ggml_vk_destroy_buffer(b);
|
||||
}
|
||||
|
||||
return ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
return ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
}
|
||||
|
||||
static void ggml_vk_pool_free(vk_buffer& buffer) {
|
||||
@ -1250,10 +1289,6 @@ static void * ggml_vk_host_malloc(size_t size) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
|
||||
#endif
|
||||
if (getenv("GGML_VK_NO_PINNED") != nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
vk_buffer buf = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
|
||||
if(!(buf.memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
|
||||
@ -1298,6 +1333,20 @@ static void ggml_vk_host_free(void* ptr) {
|
||||
vk_pinned_memory.erase(vk_pinned_memory.begin() + index);
|
||||
}
|
||||
|
||||
static void ggml_vk_host_get(const void * ptr, vk_buffer *& buf, size_t& buf_offset) {
|
||||
buf = nullptr;
|
||||
buf_offset = 0;
|
||||
for (size_t i = 0; i < vk_pinned_memory.size(); i++) {
|
||||
const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]);
|
||||
const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]);
|
||||
if (ptr >= addr && ptr < endr) {
|
||||
buf = &std::get<2>(vk_pinned_memory[i]);
|
||||
buf_offset = ((const uint8_t *)ptr) - addr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static vk_submission ggml_vk_begin_submission(vk_queue& q, bool one_time = true) {
|
||||
vk_submission s;
|
||||
s.buffer = ggml_vk_create_cmd_buffer(q);
|
||||
@ -1384,6 +1433,13 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
|
||||
}
|
||||
}
|
||||
|
||||
static void ensure_sync_staging_buffer(size_t size) {
|
||||
if (vk_sync_staging.size < size) {
|
||||
ggml_vk_destroy_buffer(vk_sync_staging);
|
||||
vk_sync_staging = ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl;
|
||||
@ -1391,21 +1447,13 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size
|
||||
GGML_ASSERT(!ggml_is_contiguous(tensor));
|
||||
// Buffer is already mapped
|
||||
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
||||
std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
|
||||
std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
// Check if src is pinned memory
|
||||
vk_buffer * buf = nullptr;
|
||||
size_t buf_offset = 0;
|
||||
for (size_t i = 0; i < vk_pinned_memory.size(); i++) {
|
||||
const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]);
|
||||
const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]);
|
||||
if (tensor->data >= addr && tensor->data < endr) {
|
||||
buf = &std::get<2>(vk_pinned_memory[i]);
|
||||
buf_offset = ((const uint8_t *)tensor->data) - addr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
size_t buf_offset;
|
||||
ggml_vk_host_get(tensor->data, buf, buf_offset);
|
||||
|
||||
const uint64_t ne0 = tensor->ne[0];
|
||||
const uint64_t ne1 = tensor->ne[1];
|
||||
@ -1463,10 +1511,7 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size
|
||||
if (vk_staging.size < vk_staging_offset + copy_size) {
|
||||
if (sync_staging) {
|
||||
// Create temporary larger buffer
|
||||
if (vk_sync_staging.size < copy_size) {
|
||||
ggml_vk_destroy_buffer(vk_sync_staging);
|
||||
vk_sync_staging = ggml_vk_create_buffer(copy_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
}
|
||||
ensure_sync_staging_buffer(copy_size);
|
||||
|
||||
staging = &vk_sync_staging;
|
||||
staging_offset = 0;
|
||||
@ -1513,16 +1558,8 @@ static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size
|
||||
}
|
||||
// Check if src is pinned memory
|
||||
vk_buffer * buf = nullptr;
|
||||
size_t buf_offset = 0;
|
||||
for (size_t i = 0; i < vk_pinned_memory.size(); i++) {
|
||||
const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]);
|
||||
const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]);
|
||||
if (src >= addr && src < endr) {
|
||||
buf = &std::get<2>(vk_pinned_memory[i]);
|
||||
buf_offset = ((const uint8_t *)src) - addr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
size_t buf_offset;
|
||||
ggml_vk_host_get(src, buf, buf_offset);
|
||||
|
||||
if (buf != nullptr) {
|
||||
// Memory is pinned, use as staging buffer
|
||||
@ -1555,10 +1592,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size
|
||||
const size_t copy_size = width*height;
|
||||
if (vk_staging.size < vk_staging_offset + copy_size) {
|
||||
if (sync_staging) {
|
||||
if (vk_sync_staging.size < copy_size) {
|
||||
ggml_vk_destroy_buffer(vk_sync_staging);
|
||||
vk_sync_staging = ggml_vk_create_buffer(copy_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
}
|
||||
ensure_sync_staging_buffer(copy_size);
|
||||
|
||||
staging = &vk_sync_staging;
|
||||
staging_offset = 0;
|
||||
@ -1634,16 +1668,8 @@ static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_
|
||||
GGML_ASSERT(src->size > 0);
|
||||
// Check if dst is pinned memory
|
||||
vk_buffer * buf = nullptr;
|
||||
size_t buf_offset = 0;
|
||||
for (size_t i = 0; i < vk_pinned_memory.size(); i++) {
|
||||
const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]);
|
||||
const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]);
|
||||
if (dst >= addr && dst < endr) {
|
||||
buf = &std::get<2>(vk_pinned_memory[i]);
|
||||
buf_offset = ((const uint8_t *)dst) - addr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
size_t buf_offset;
|
||||
ggml_vk_host_get(dst, buf, buf_offset);
|
||||
|
||||
std::vector<vk::BufferCopy> slices(1);
|
||||
if (width == spitch && width == dpitch) {
|
||||
@ -1677,10 +1703,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_
|
||||
if (vk_staging.size < vk_staging_offset + copy_size) {
|
||||
if (sync_staging) {
|
||||
// Create temporary larger buffer
|
||||
if (vk_sync_staging.size < copy_size) {
|
||||
ggml_vk_destroy_buffer(vk_sync_staging);
|
||||
vk_sync_staging = ggml_vk_create_buffer(copy_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
}
|
||||
ensure_sync_staging_buffer(copy_size);
|
||||
|
||||
staging = &vk_sync_staging;
|
||||
} else {
|
||||
@ -1819,7 +1842,7 @@ static void ggml_vk_d2h_tensor_2d(vk_context * ctx, vk_buffer * src, size_t offs
|
||||
|
||||
static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ", " << aligned << ")";
|
||||
std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")";
|
||||
#endif
|
||||
if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
|
||||
#ifdef VK_DEBUG
|
||||
@ -2003,8 +2026,27 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co
|
||||
const uint64_t r2 = ne12 / ne02;
|
||||
const uint64_t r3 = ne13 / ne03;
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer * d_Qx = nullptr;
|
||||
size_t qx_buf_offset = 0;
|
||||
vk_buffer * d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
|
||||
bool src0_uma = false;
|
||||
bool src1_uma = false;
|
||||
|
||||
if (vk_device.uma) {
|
||||
ggml_vk_host_get(src0->data, d_Qx, qx_buf_offset);
|
||||
ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset);
|
||||
src0_uma = d_Qx != nullptr;
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
|
||||
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
||||
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
||||
@ -2034,32 +2076,24 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co
|
||||
const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
|
||||
const uint64_t d_sz = sizeof(float) * d_ne;
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer* d_D = &extra->buffer_gpu;
|
||||
const uint64_t d_buf_offset = extra->offset;
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
|
||||
vk_buffer * d_Qx;
|
||||
uint64_t qx_buf_offset = 0;
|
||||
vk_buffer * d_Qy;
|
||||
uint64_t qy_buf_offset = 0;
|
||||
vk_buffer* d_X;
|
||||
uint64_t x_buf_offset = 0;
|
||||
vk_buffer* d_Y;
|
||||
uint64_t y_buf_offset = 0;
|
||||
if (load_x) {
|
||||
d_Qx = &vk_prealloc_qx;
|
||||
} else {
|
||||
} else if (!src0_uma) {
|
||||
d_Qx = &extra_src0->buffer_gpu;
|
||||
qx_buf_offset = extra_src0->offset;
|
||||
GGML_ASSERT(d_Qx != nullptr);
|
||||
}
|
||||
if (load_y) {
|
||||
d_Qy = &vk_prealloc_qy;
|
||||
} else {
|
||||
} else if (!src1_uma) {
|
||||
d_Qy = &extra_src1->buffer_gpu;
|
||||
qy_buf_offset = extra_src1->offset;
|
||||
GGML_ASSERT(d_Qy != nullptr);
|
||||
@ -2178,8 +2212,27 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0
|
||||
const uint64_t r2 = ne12 / ne02;
|
||||
const uint64_t r3 = ne13 / ne03;
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer * d_Qx = nullptr;
|
||||
size_t qx_buf_offset = 0;
|
||||
vk_buffer * d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
|
||||
bool src0_uma = false;
|
||||
bool src1_uma = false;
|
||||
|
||||
if (vk_device.uma) {
|
||||
ggml_vk_host_get(src0->data, d_Qx, qx_buf_offset);
|
||||
ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset);
|
||||
src0_uma = d_Qx != nullptr;
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
|
||||
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
||||
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
||||
@ -2199,31 +2252,23 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0
|
||||
const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
|
||||
const uint64_t d_sz = sizeof(float) * d_ne;
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer* d_D = &extra->buffer_gpu;
|
||||
const uint64_t d_buf_offset = extra->offset;
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
vk_buffer* d_Qx;
|
||||
uint32_t qx_buf_offset = 0;
|
||||
vk_buffer* d_Qy;
|
||||
uint32_t qy_buf_offset = 0;
|
||||
vk_buffer* d_X;
|
||||
uint64_t x_buf_offset = 0;
|
||||
vk_buffer* d_Y;
|
||||
uint64_t y_buf_offset = 0;
|
||||
if (load_x) {
|
||||
d_Qx = &vk_prealloc_qx;
|
||||
} else {
|
||||
} else if(!src1_uma) {
|
||||
d_Qx = &extra_src0->buffer_gpu;
|
||||
qx_buf_offset = extra_src0->offset;
|
||||
GGML_ASSERT(d_Qx != nullptr);
|
||||
}
|
||||
if (load_y) {
|
||||
d_Qy = &vk_prealloc_qy;
|
||||
} else {
|
||||
} else if(!src1_uma) {
|
||||
d_Qy = &extra_src1->buffer_gpu;
|
||||
qy_buf_offset = extra_src1->offset;
|
||||
GGML_ASSERT(d_Qy != nullptr);
|
||||
@ -2345,7 +2390,21 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor
|
||||
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer * d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
|
||||
bool src1_uma = false;
|
||||
|
||||
if (vk_device.uma) {
|
||||
ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset);
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
|
||||
const uint64_t x_ne = ne00 * ne01 * ne02;
|
||||
const uint64_t y_ne = ne10 * ne11 * ne12;
|
||||
@ -2355,22 +2414,15 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor
|
||||
const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
|
||||
const uint64_t d_sz = sizeof(float) * d_ne;
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer* d_D = &extra->buffer_gpu;
|
||||
const uint64_t d_buf_offset = extra->offset;
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
vk_buffer* d_Qx;
|
||||
vk_buffer* d_Qx = &extra_src0->buffer_gpu;
|
||||
const uint64_t qx_buf_offset = extra_src0->offset;
|
||||
vk_buffer* d_Qy;
|
||||
uint64_t qy_buf_offset = 0;
|
||||
d_Qx = &extra_src0->buffer_gpu;
|
||||
GGML_ASSERT(d_Qx != nullptr);
|
||||
if (load_y) {
|
||||
d_Qy = &vk_prealloc_qy;
|
||||
} else {
|
||||
} else if (!src1_uma) {
|
||||
d_Qy = &extra_src1->buffer_gpu;
|
||||
qy_buf_offset = extra_src1->offset;
|
||||
GGML_ASSERT(d_Qx != nullptr);
|
||||
@ -2430,7 +2482,21 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context * ctx, const ggml_tensor *
|
||||
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer * d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
|
||||
bool src1_uma = false;
|
||||
|
||||
if (vk_device.uma) {
|
||||
ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset);
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
|
||||
const uint64_t d_ne = ne01 * ne11 * ne12;
|
||||
|
||||
@ -2441,18 +2507,11 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context * ctx, const ggml_tensor *
|
||||
const uint64_t qy_sz = ggml_nbytes(src1);
|
||||
const uint64_t d_sz = sizeof(float) * d_ne;
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer* d_D = &extra->buffer_gpu;
|
||||
const uint64_t d_buf_offset = extra->offset;
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
vk_buffer* d_Qx;
|
||||
vk_buffer* d_Qx = &extra_src0->buffer_gpu;
|
||||
const uint64_t qx_buf_offset = extra_src0->offset;
|
||||
vk_buffer* d_Qy;
|
||||
uint64_t qy_buf_offset = 0;
|
||||
d_Qx = &extra_src0->buffer_gpu;
|
||||
GGML_ASSERT(d_Qx != nullptr);
|
||||
if (load_y) {
|
||||
d_Qy = &vk_prealloc_qy;
|
||||
@ -2709,7 +2768,8 @@ static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
|
||||
}
|
||||
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name);
|
||||
static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name);
|
||||
static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor);
|
||||
#endif
|
||||
|
||||
template<typename PC>
|
||||
@ -2758,17 +2818,34 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm
|
||||
return;
|
||||
}
|
||||
|
||||
const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU;
|
||||
const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||
|
||||
vk_buffer * d_X = nullptr;
|
||||
size_t x_buf_offset = 0;
|
||||
vk_buffer * d_Y = nullptr;
|
||||
size_t y_buf_offset = 0;
|
||||
|
||||
bool src0_uma = false;
|
||||
bool src1_uma = false;
|
||||
|
||||
if (vk_device.uma) {
|
||||
ggml_vk_host_get(src0->data, d_X, x_buf_offset);
|
||||
src0_uma = d_X != nullptr;
|
||||
if (use_src1) {
|
||||
ggml_vk_host_get(src1->data, d_Y, y_buf_offset);
|
||||
src1_uma = d_Y != nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma;
|
||||
const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
|
||||
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, vk_device.properties.limits.minStorageBufferOffsetAlignment);
|
||||
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, vk_device.properties.limits.minStorageBufferOffsetAlignment) : 0;
|
||||
uint64_t d_sz = ggml_type_size(dst->type) * ne0;
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||
|
||||
// Workaround for tiny tensor inputs on ROPE
|
||||
if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > extra_src1->buffer_gpu.size) {
|
||||
y_sz = VK_WHOLE_SIZE;
|
||||
@ -2778,20 +2855,16 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
uint64_t d_buf_offset = (extra->offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
|
||||
GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
|
||||
vk_buffer* d_X = nullptr;
|
||||
uint64_t x_buf_offset = 0;
|
||||
vk_buffer* d_Y = nullptr;
|
||||
uint64_t y_buf_offset = 0;
|
||||
if (transfer_src0) {
|
||||
d_X = &vk_prealloc_qx;
|
||||
} else {
|
||||
} else if(!src0_uma) {
|
||||
d_X = &extra_src0->buffer_gpu;
|
||||
x_buf_offset = extra_src0->offset;
|
||||
GGML_ASSERT(d_X != nullptr);
|
||||
}
|
||||
if (transfer_src1) {
|
||||
d_Y = &vk_prealloc_qy;
|
||||
} else if (use_src1) {
|
||||
} else if (use_src1 && !src1_uma) {
|
||||
d_Y = &extra_src1->buffer_gpu;
|
||||
y_buf_offset = extra_src1->offset;
|
||||
GGML_ASSERT(d_Y != nullptr);
|
||||
@ -3148,13 +3221,13 @@ static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size
|
||||
if (vk_prealloc_split_k.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_split_k);
|
||||
}
|
||||
vk_prealloc_split_k = ggml_vk_create_buffer(sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_split_k = ggml_vk_create_buffer_check(sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
}
|
||||
}
|
||||
|
||||
vk_buffer d_X = ggml_vk_create_buffer(sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_Y = ggml_vk_create_buffer(sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_D = ggml_vk_create_buffer(sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_X = ggml_vk_create_buffer_check(sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_Y = ggml_vk_create_buffer_check(sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_D = ggml_vk_create_buffer_check(sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
|
||||
X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne);
|
||||
Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne);
|
||||
@ -3315,6 +3388,10 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
|
||||
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
||||
return;
|
||||
}
|
||||
i0 = std::max(i0, 5);
|
||||
i1 = std::max(i1, 5);
|
||||
i2 = std::max(i2, 0);
|
||||
i3 = std::max(i3, 0);
|
||||
fprintf(stderr, " ");
|
||||
for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
|
||||
fprintf(stderr, "%7d ", idx1);
|
||||
@ -3376,7 +3453,7 @@ static void ggml_vk_test_h2d_nc(size_t ne0, size_t ne1, size_t ne2, size_t ne3)
|
||||
vk_context * ctx = ggml_vk_create_context(vk_device.compute_queue);
|
||||
ggml_vk_ctx_begin(ctx);
|
||||
|
||||
vk_buffer buffer = ggml_vk_create_buffer(ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer buffer = ggml_vk_create_buffer_check(ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
|
||||
ggml_vk_h2d_tensor_2d(ctx, &buffer, 0, tensor, 0, 0, ggml_nrows(tensor));
|
||||
|
||||
@ -3439,7 +3516,7 @@ static void ggml_vk_test_transfer(size_t ne, bool pinned) {
|
||||
std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl;
|
||||
#endif
|
||||
// Check transfers are correct
|
||||
vk_buffer buffer = ggml_vk_create_buffer(sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer buffer = ggml_vk_create_buffer_check(sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
|
||||
float * x;
|
||||
float * y;
|
||||
@ -3666,7 +3743,7 @@ void ggml_vk_preallocate_buffers() {
|
||||
std::cerr << "qx_size: " << vk_prealloc_size_qx << " qy_size: " << vk_prealloc_size_qy << " x_size: " << vk_prealloc_size_x << " y_size: " << vk_prealloc_size_y << " split_k_size: " << vk_prealloc_size_split_k << std::endl;
|
||||
#endif
|
||||
#if defined(VK_RUN_TESTS)
|
||||
vk_staging = ggml_vk_create_buffer(100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
vk_staging = ggml_vk_create_buffer_check(100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
ggml_vk_test_transfer(8192 * 1000, false);
|
||||
ggml_vk_test_transfer(8192 * 1000, true);
|
||||
|
||||
@ -3712,42 +3789,42 @@ void ggml_vk_preallocate_buffers() {
|
||||
if (vk_prealloc_qx.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_qx);
|
||||
}
|
||||
vk_prealloc_qx = ggml_vk_create_buffer(vk_prealloc_size_qx, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_qx = ggml_vk_create_buffer_device(vk_prealloc_size_qx);
|
||||
}
|
||||
if (vk_prealloc_size_qy > 0 && vk_prealloc_qy.size < vk_prealloc_size_qy) {
|
||||
// Resize buffer
|
||||
if (vk_prealloc_qy.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_qy);
|
||||
}
|
||||
vk_prealloc_qy = ggml_vk_create_buffer(vk_prealloc_size_qy, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_qy = ggml_vk_create_buffer_device(vk_prealloc_size_qy);
|
||||
}
|
||||
if (vk_prealloc_size_x > 0 && vk_prealloc_x.size < vk_prealloc_size_x) {
|
||||
// Resize buffer
|
||||
if (vk_prealloc_x.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_x);
|
||||
}
|
||||
vk_prealloc_x = ggml_vk_create_buffer(vk_prealloc_size_x, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_x = ggml_vk_create_buffer_device(vk_prealloc_size_x);
|
||||
}
|
||||
if (vk_prealloc_size_y > 0 && vk_prealloc_y.size < vk_prealloc_size_y) {
|
||||
// Resize buffer
|
||||
if (vk_prealloc_y.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_y);
|
||||
}
|
||||
vk_prealloc_y = ggml_vk_create_buffer(vk_prealloc_size_y, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_y = ggml_vk_create_buffer_device(vk_prealloc_size_y);
|
||||
}
|
||||
if (vk_prealloc_size_split_k > 0 && vk_prealloc_split_k.size < vk_prealloc_size_split_k) {
|
||||
// Resize buffer
|
||||
if (vk_prealloc_split_k.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_split_k);
|
||||
}
|
||||
vk_prealloc_split_k = ggml_vk_create_buffer(vk_prealloc_size_split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_split_k = ggml_vk_create_buffer_device(vk_prealloc_size_split_k);
|
||||
}
|
||||
if (vk_staging_size > 0 && vk_staging.size < vk_staging_size) {
|
||||
// Resize buffer
|
||||
if (vk_staging.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_staging);
|
||||
}
|
||||
vk_staging = ggml_vk_create_buffer(vk_staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
vk_staging = ggml_vk_create_buffer_check(vk_staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4138,6 +4215,7 @@ GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
|
||||
|
||||
GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||
ggml_vk_destroy_buffer(ctx->dev_buffer);
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
@ -4163,14 +4241,6 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
|
||||
extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
|
||||
}
|
||||
|
||||
if (extra->offset + ggml_nbytes(tensor) > extra->buffer_gpu.size) {
|
||||
std::cerr << "ERROR: Trying to assign tensor " << tensor << " outside of buffer size " << ctx->dev_buffer.size << " requested offset: " << extra->offset << " tensor size: " << ggml_nbytes(tensor) << std::endl;
|
||||
if (tensor->view_src != nullptr) {
|
||||
std::cerr << "view_src: " << tensor->view_src << " extra: " << tensor->view_src->extra << std::endl;
|
||||
}
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
tensor->extra = extra;
|
||||
}
|
||||
@ -4248,7 +4318,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
|
||||
#endif
|
||||
vk_buffer dev_buffer = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer dev_buffer = ggml_vk_create_buffer_device(size);
|
||||
|
||||
ggml_backend_vk_buffer_context * ctx = new ggml_backend_vk_buffer_context(dev_buffer);
|
||||
|
||||
@ -4326,9 +4396,12 @@ GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffe
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
void * ptr = ggml_vk_host_malloc(size);
|
||||
|
||||
if (ptr == nullptr) {
|
||||
void * ptr = nullptr;
|
||||
try {
|
||||
ptr = ggml_vk_host_malloc(size);
|
||||
} catch (vk::SystemError& e) {
|
||||
std::cerr << "ggml_vulkan: Failed to allocate pinned memory." << std::endl;
|
||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||
// fallback to cpu buffer
|
||||
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
||||
}
|
||||
@ -4389,7 +4462,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl;
|
||||
#endif
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_vk_buffer_type() && "unsupported buffer type");
|
||||
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type() || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
@ -4409,7 +4482,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl;
|
||||
#endif
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_vk_buffer_type() && "unsupported buffer type");
|
||||
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type() || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
@ -4429,7 +4502,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl;
|
||||
#endif
|
||||
if (dst->buffer->buft == ggml_backend_vk_buffer_type() && ggml_backend_buffer_is_vk(src->buffer)) {
|
||||
if ((dst->buffer->buft == ggml_backend_vk_buffer_type() || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
|
||||
ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
|
||||
@ -4499,7 +4572,6 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
|
||||
|
||||
bool ok = ggml_vk_compute_forward(¶ms, node);
|
||||
if (!ok) {
|
||||
std::cerr << "Vulkan disable: " << vk_disable << std::endl;
|
||||
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
||||
}
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
@ -4665,7 +4737,7 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
|
||||
// checks
|
||||
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<const ggml_tensor *>& done, int level = 0) {
|
||||
static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<const ggml_tensor *>& done, int level = 0) {
|
||||
if (std::find(done.begin(), done.end(), tensor) != done.end() || level > 10) {
|
||||
return;
|
||||
}
|
||||
@ -4683,10 +4755,14 @@ void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<const gg
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
|
||||
static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
|
||||
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
||||
return;
|
||||
}
|
||||
i0 = std::max(i0, 5);
|
||||
i1 = std::max(i1, 5);
|
||||
i2 = std::max(i2, 0);
|
||||
i3 = std::max(i3, 0);
|
||||
fprintf(stderr, " ");
|
||||
for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
|
||||
fprintf(stderr, "%7d ", idx1);
|
||||
@ -4698,9 +4774,9 @@ void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, in
|
||||
if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) {
|
||||
float val;
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
val = *(float *) ((char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
|
||||
val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
|
||||
} else if (tensor->type == GGML_TYPE_F16) {
|
||||
val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
|
||||
val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
|
||||
}
|
||||
fprintf(stderr, "% 7.2f ", val);
|
||||
} else {
|
||||
@ -4711,14 +4787,16 @@ void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, in
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
|
||||
static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
|
||||
void * tensor_data = tensor->data;
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_GPU) {
|
||||
const size_t tensor_size = ggml_nbytes(tensor);
|
||||
tensor_data = malloc(tensor_size);
|
||||
|
||||
ggml_vk_buffer_read((vk_buffer *)tensor->data, 0, tensor_data, tensor_size, vk_device.transfer_queue);
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size);
|
||||
}
|
||||
|
||||
std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
|
||||
@ -4730,10 +4808,10 @@ void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
|
||||
std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " backend=" << tensor->src[1]->backend << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
|
||||
}
|
||||
std::cerr << std::endl << "Result:" << std::endl;
|
||||
ggml_vk_print_tensor_area(tensor, tensor->data, 5, 5, 0, 0);
|
||||
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
|
||||
std::cerr << std::endl;
|
||||
std::cerr << std::endl << "Result:" << std::endl;
|
||||
ggml_vk_print_tensor_area(tensor, tensor->data, 5, 5, 1, 0);
|
||||
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
|
||||
std::cerr << std::endl;
|
||||
std::vector<const ggml_tensor *> done;
|
||||
ggml_vk_print_graph_origin(tensor, done);
|
||||
@ -4743,7 +4821,7 @@ void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
|
||||
static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
|
||||
return;
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU);
|
||||
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
||||
@ -4779,7 +4857,7 @@ void * comp_result;
|
||||
size_t comp_size;
|
||||
size_t comp_nb[GGML_MAX_DIMS];
|
||||
size_t check_counter = 0;
|
||||
void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) {
|
||||
static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
@ -4796,8 +4874,9 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
||||
ggml_tensor * src1 = tensor->src[1];
|
||||
|
||||
struct ggml_init_params iparams = {
|
||||
.mem_size = 1024*1024*1024,
|
||||
.mem_buffer = NULL,
|
||||
/*.mem_size =*/ 1024*1024*1024,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
|
||||
struct ggml_context * ctx = ggml_init(iparams);
|
||||
@ -4829,7 +4908,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
||||
for (int i3 = 0; i3 < src0->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < src0->ne[2]; i2++) {
|
||||
const int idx = i3*src0->ne[2] + i2;
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1], vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4842,7 +4921,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
||||
if (offset + src0_size >= extra->buffer_gpu.size) {
|
||||
src0_size = extra->buffer_gpu.size - offset;
|
||||
}
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset, src0_clone->data, src0_size, vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset, src0_clone->data, src0_size);
|
||||
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
}
|
||||
} else {
|
||||
@ -4872,7 +4951,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
||||
for (int i3 = 0; i3 < src1->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < src1->ne[2]; i2++) {
|
||||
const int idx = i3*src1->ne[2] + i2;
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1], vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4885,7 +4964,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
||||
if (offset + src1_size >= extra->buffer_gpu.size) {
|
||||
src1_size = extra->buffer_gpu.size - offset;
|
||||
}
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset, src1_clone->data, src1_size, vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset, src1_clone->data, src1_size);
|
||||
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
}
|
||||
} else {
|
||||
@ -4969,7 +5048,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
||||
} else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
|
||||
if (src1 == nullptr) {
|
||||
tensor_clone = ggml_dup(ctx, src0_clone);
|
||||
tensor_clone->type == tensor->type;
|
||||
tensor_clone->type = tensor->type;
|
||||
} else {
|
||||
tensor_clone = ggml_cpy(ctx, src0_clone, src1_clone);
|
||||
}
|
||||
@ -5046,7 +5125,7 @@ void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor)
|
||||
tensor_size = extra->buffer_gpu.size - (extra->offset);
|
||||
}
|
||||
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size, vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size);
|
||||
}
|
||||
|
||||
float first_error_result = -1.0f;
|
||||
|
185
ggml.c
185
ggml.c
@ -218,6 +218,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
||||
break;
|
||||
}
|
||||
GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
|
||||
GGML_ASSERT(false);
|
||||
return NULL;
|
||||
}
|
||||
return aligned_memory;
|
||||
@ -230,6 +231,38 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
inline static void * ggml_malloc(size_t size) {
|
||||
if (size == 0) {
|
||||
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
|
||||
return NULL;
|
||||
}
|
||||
void * result = malloc(size);
|
||||
if (result == NULL) {
|
||||
GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// calloc
|
||||
inline static void * ggml_calloc(size_t num, size_t size) {
|
||||
if (num == 0 || size == 0) {
|
||||
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
|
||||
return NULL;
|
||||
}
|
||||
void * result = calloc(num, size);
|
||||
if (result == NULL) {
|
||||
GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
#define GGML_MALLOC(size) ggml_malloc(size)
|
||||
#define GGML_CALLOC(num, size) ggml_calloc(num, size)
|
||||
|
||||
#define GGML_FREE(ptr) free(ptr)
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
|
||||
|
||||
@ -599,6 +632,17 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
},
|
||||
[GGML_TYPE_IQ3_XXS] = {
|
||||
.type_name = "iq3_xxs",
|
||||
.blck_size = QK_K,
|
||||
.type_size = sizeof(block_iq3_xxs),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
|
||||
.from_float = quantize_row_iq3_xxs,
|
||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
|
||||
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
},
|
||||
[GGML_TYPE_Q8_K] = {
|
||||
.type_name = "q8_K",
|
||||
.blck_size = QK_K,
|
||||
@ -2206,6 +2250,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
||||
case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
|
||||
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
||||
}
|
||||
@ -7652,6 +7697,7 @@ static void ggml_compute_forward_add(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
ggml_compute_forward_add_q_f32(params, src0, src1, dst);
|
||||
} break;
|
||||
@ -7918,6 +7964,7 @@ static void ggml_compute_forward_add1(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
|
||||
} break;
|
||||
@ -8037,6 +8084,7 @@ static void ggml_compute_forward_acc(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
@ -10788,6 +10836,7 @@ static void ggml_compute_forward_out_prod(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
|
||||
} break;
|
||||
@ -10967,6 +11016,7 @@ static void ggml_compute_forward_set(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
@ -11163,6 +11213,7 @@ static void ggml_compute_forward_get_rows(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
ggml_compute_forward_get_rows_q(params, src0, src1, dst);
|
||||
} break;
|
||||
@ -11810,6 +11861,7 @@ static void ggml_compute_forward_alibi(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_Q8_K:
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
@ -11886,6 +11938,7 @@ static void ggml_compute_forward_clamp(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_Q8_K:
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
@ -15439,13 +15492,13 @@ struct ggml_hash_set ggml_hash_set_new(size_t size) {
|
||||
size = ggml_hash_size(size);
|
||||
struct ggml_hash_set result;
|
||||
result.size = size;
|
||||
result.keys = malloc(sizeof(struct ggml_tensor *) * size);
|
||||
result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
|
||||
memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
|
||||
return result;
|
||||
}
|
||||
|
||||
static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
|
||||
free(hash_set.keys);
|
||||
GGML_FREE(hash_set.keys);
|
||||
}
|
||||
|
||||
struct hash_map {
|
||||
@ -15454,17 +15507,17 @@ struct hash_map {
|
||||
};
|
||||
|
||||
static struct hash_map * ggml_new_hash_map(size_t size) {
|
||||
struct hash_map * result = malloc(sizeof(struct hash_map));
|
||||
struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
|
||||
result->set = ggml_hash_set_new(size);
|
||||
result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
|
||||
result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size);
|
||||
memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
|
||||
return result;
|
||||
}
|
||||
|
||||
static void ggml_hash_map_free(struct hash_map * map) {
|
||||
ggml_hash_set_free(map->set);
|
||||
free(map->vals);
|
||||
free(map);
|
||||
GGML_FREE(map->vals);
|
||||
GGML_FREE(map);
|
||||
}
|
||||
|
||||
// gradient checkpointing
|
||||
@ -19145,6 +19198,7 @@ void ggml_quantize_init(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
|
||||
case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
|
||||
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
|
||||
default: // nothing
|
||||
break;
|
||||
}
|
||||
@ -19407,6 +19461,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||
result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||
GGML_ASSERT(result == row_size * nrows);
|
||||
} break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
GGML_ASSERT(start % QK_K == 0);
|
||||
GGML_ASSERT(start % n_per_row == 0);
|
||||
size_t start_row = start / n_per_row;
|
||||
size_t row_size = ggml_row_size(type, n_per_row);
|
||||
result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||
GGML_ASSERT(result == row_size * nrows);
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
size_t elemsize = sizeof(ggml_fp16_t);
|
||||
@ -19533,6 +19596,25 @@ struct gguf_context {
|
||||
void * data;
|
||||
};
|
||||
|
||||
static size_t gguf_type_size(enum gguf_type type) {
|
||||
GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
|
||||
return GGUF_TYPE_SIZE[type];
|
||||
}
|
||||
|
||||
static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
|
||||
GGML_ASSERT(info->n_dims <= GGML_MAX_DIMS);
|
||||
GGML_ASSERT(0 <= info->type && info->type < GGML_TYPE_COUNT);
|
||||
|
||||
for (uint32_t i = 0; i < info->n_dims; ++i) {
|
||||
GGML_ASSERT(info->ne[i] > 0);
|
||||
}
|
||||
|
||||
// prevent overflow for total number of elements
|
||||
GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]);
|
||||
GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]);
|
||||
GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]);
|
||||
}
|
||||
|
||||
static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
|
||||
const size_t n = fread(dst, 1, size, file);
|
||||
*offset += n;
|
||||
@ -19545,7 +19627,16 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
|
||||
|
||||
bool ok = true;
|
||||
|
||||
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
|
||||
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
|
||||
|
||||
// early exit if string length is invalid, prevents from integer overflow
|
||||
if (p->n == SIZE_MAX) {
|
||||
fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
|
||||
return false;
|
||||
}
|
||||
|
||||
p->data = GGML_CALLOC(p->n + 1, 1);
|
||||
|
||||
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
||||
|
||||
return ok;
|
||||
@ -19618,6 +19709,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// sanity-checks to prevent from integer/buffer overflows
|
||||
|
||||
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
|
||||
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
|
||||
ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct gguf_kv));
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read header\n", __func__);
|
||||
fclose(file);
|
||||
@ -19628,7 +19725,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
// read the kv pairs
|
||||
{
|
||||
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||
ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||
|
||||
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
|
||||
struct gguf_kv * kv = &ctx->kv[i];
|
||||
@ -19671,21 +19768,39 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
case GGUF_TYPE_FLOAT64:
|
||||
case GGUF_TYPE_BOOL:
|
||||
{
|
||||
kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
||||
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
|
||||
// prevent from integer overflow in the malloc below
|
||||
if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
|
||||
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
|
||||
|
||||
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
|
||||
} break;
|
||||
case GGUF_TYPE_STRING:
|
||||
{
|
||||
kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
|
||||
// prevent from integer overflow in the malloc below
|
||||
if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
|
||||
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
|
||||
|
||||
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
|
||||
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
||||
default: GGML_ASSERT(false && "invalid type"); break;
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
||||
default: GGML_ASSERT(false && "invalid type");
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
@ -19703,7 +19818,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
// read the tensor infos
|
||||
{
|
||||
ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
||||
ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
||||
|
||||
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
|
||||
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||
@ -19714,12 +19829,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
ok = ok && gguf_fread_str(file, &info->name, &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
|
||||
|
||||
ok = ok && (info->n_dims <= GGML_MAX_DIMS);
|
||||
|
||||
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
||||
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
||||
}
|
||||
|
||||
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
|
||||
|
||||
gguf_tensor_info_sanitize(info);
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
|
||||
fclose(file);
|
||||
@ -19873,12 +19994,12 @@ void gguf_free(struct gguf_context * ctx) {
|
||||
struct gguf_kv * kv = &ctx->kv[i];
|
||||
|
||||
if (kv->key.data) {
|
||||
free(kv->key.data);
|
||||
GGML_FREE(kv->key.data);
|
||||
}
|
||||
|
||||
if (kv->type == GGUF_TYPE_STRING) {
|
||||
if (kv->value.str.data) {
|
||||
free(kv->value.str.data);
|
||||
GGML_FREE(kv->value.str.data);
|
||||
}
|
||||
}
|
||||
|
||||
@ -19888,16 +20009,16 @@ void gguf_free(struct gguf_context * ctx) {
|
||||
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
|
||||
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
|
||||
if (str->data) {
|
||||
free(str->data);
|
||||
GGML_FREE(str->data);
|
||||
}
|
||||
}
|
||||
}
|
||||
free(kv->value.arr.data);
|
||||
GGML_FREE(kv->value.arr.data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
free(ctx->kv);
|
||||
GGML_FREE(ctx->kv);
|
||||
}
|
||||
|
||||
if (ctx->infos) {
|
||||
@ -19905,11 +20026,11 @@ void gguf_free(struct gguf_context * ctx) {
|
||||
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||
|
||||
if (info->name.data) {
|
||||
free(info->name.data);
|
||||
GGML_FREE(info->name.data);
|
||||
}
|
||||
}
|
||||
|
||||
free(ctx->infos);
|
||||
GGML_FREE(ctx->infos);
|
||||
}
|
||||
|
||||
GGML_ALIGNED_FREE(ctx);
|
||||
@ -20210,8 +20331,8 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
|
||||
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
|
||||
ctx->kv[idx].value.arr.type = type;
|
||||
ctx->kv[idx].value.arr.n = n;
|
||||
ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
|
||||
memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
|
||||
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
|
||||
memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
|
||||
}
|
||||
|
||||
void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
|
||||
@ -20220,7 +20341,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
|
||||
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
|
||||
ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
|
||||
ctx->kv[idx].value.arr.n = n;
|
||||
ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
|
||||
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
|
||||
for (int i = 0; i < n; i++) {
|
||||
struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
|
||||
str->n = strlen(data[i]);
|
||||
@ -20247,19 +20368,19 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
|
||||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
|
||||
const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
|
||||
const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
|
||||
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
|
||||
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
|
||||
}
|
||||
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
|
||||
free((void *)data);
|
||||
GGML_FREE((void *)data);
|
||||
} else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
|
||||
GGML_ASSERT(false && "nested arrays not supported");
|
||||
} else {
|
||||
gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
||||
default: GGML_ASSERT(false && "invalid type"); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -20335,7 +20456,7 @@ struct gguf_buf {
|
||||
|
||||
static struct gguf_buf gguf_buf_init(size_t size) {
|
||||
struct gguf_buf buf = {
|
||||
/*buf.data =*/ size == 0 ? NULL : malloc(size),
|
||||
/*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
|
||||
/*buf.size =*/ size,
|
||||
/*buf.offset =*/ 0,
|
||||
};
|
||||
@ -20345,7 +20466,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
|
||||
|
||||
static void gguf_buf_free(struct gguf_buf buf) {
|
||||
if (buf.data) {
|
||||
free(buf.data);
|
||||
GGML_FREE(buf.data);
|
||||
}
|
||||
}
|
||||
|
||||
@ -20426,7 +20547,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
|
||||
case GGUF_TYPE_FLOAT64:
|
||||
case GGUF_TYPE_BOOL:
|
||||
{
|
||||
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
||||
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
|
||||
} break;
|
||||
case GGUF_TYPE_STRING:
|
||||
{
|
||||
@ -20435,10 +20556,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
||||
default: GGML_ASSERT(false && "invalid type"); break;
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
||||
default: GGML_ASSERT(false && "invalid type");
|
||||
}
|
||||
}
|
||||
|
||||
|
2
ggml.h
2
ggml.h
@ -353,6 +353,7 @@ extern "C" {
|
||||
GGML_TYPE_Q8_K = 15,
|
||||
GGML_TYPE_IQ2_XXS = 16,
|
||||
GGML_TYPE_IQ2_XS = 17,
|
||||
GGML_TYPE_IQ3_XXS = 18,
|
||||
GGML_TYPE_I8,
|
||||
GGML_TYPE_I16,
|
||||
GGML_TYPE_I32,
|
||||
@ -389,6 +390,7 @@ extern "C" {
|
||||
GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
|
1
kompute
Submodule
1
kompute
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 4565194ed7c32d1d2efa32ceab4d3c6cae006306
|
102
kompute-shaders/common.comp
Normal file
102
kompute-shaders/common.comp
Normal file
@ -0,0 +1,102 @@
|
||||
#extension GL_EXT_shader_16bit_storage: require
|
||||
#extension GL_EXT_shader_8bit_storage: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
|
||||
#extension GL_EXT_control_flow_attributes: enable
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#extension GL_EXT_debug_printf : enable
|
||||
|
||||
#define QK4_0 32
|
||||
#define QK4_1 32
|
||||
|
||||
#define GELU_COEF_A 0.044715
|
||||
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
|
||||
#define TWOPI_F 6.283185307179586f
|
||||
|
||||
#define QK_K 256
|
||||
|
||||
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
|
||||
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
|
||||
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
|
||||
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
|
||||
|
||||
#define sizeof_block_q4_0 0x12
|
||||
struct block_q4_0 {
|
||||
float16_t d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
};
|
||||
mat4 dequantize_q4_0(const block_q4_0 xb, uint il) {
|
||||
const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float md = -8.f * xb.d;
|
||||
const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
|
||||
const uint16_t mask1 = mask0 << 8;
|
||||
|
||||
mat4 reg;
|
||||
for (int i=0;i<8;i++) {
|
||||
uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
|
||||
reg[i/2][2*(i%2)+0] = d1 * (b & mask0) + md;
|
||||
reg[i/2][2*(i%2)+1] = d2 * (b & mask1) + md;
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
#define sizeof_block_q4_1 0x14
|
||||
struct block_q4_1 {
|
||||
float16_t d;
|
||||
float16_t m;
|
||||
uint8_t qs[QK4_1 / 2];
|
||||
};
|
||||
mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
|
||||
const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float m = xb.m;
|
||||
const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
|
||||
const uint16_t mask1 = mask0 << 8;
|
||||
|
||||
mat4 reg;
|
||||
for (int i=0;i<8;i++) {
|
||||
uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
|
||||
reg[i/2][2*(i%2)+0] = ((b & mask0) * d1) + m;
|
||||
reg[i/2][2*(i%2)+1] = ((b & mask1) * d2) + m;
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
#define sizeof_block_q6_k 210
|
||||
struct block_q6_k {
|
||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
||||
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
||||
float16_t d; // super-block scale
|
||||
};
|
||||
mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
|
||||
const float16_t d_all = xb.d;
|
||||
|
||||
const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
|
||||
const uint qhIndex = 32*(il/8) + 16*(il&1);
|
||||
float16_t sc = xb.scales[(il%2) + 2 * ((il/2))];
|
||||
il = (il/2) & 3;
|
||||
|
||||
const uint16_t kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3);
|
||||
const uint16_t kmask2 = il>1 ? uint8_t(0xF0) : uint8_t(0x0F);
|
||||
const float16_t coef = il>1 ? float16_t(1.f/16.f) : float16_t(1.f);
|
||||
const float16_t ml = float16_t(d_all * sc * 32.f);
|
||||
const float16_t dl = float16_t(d_all * sc * coef);
|
||||
mat4 reg;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
const float16_t q = (il&1) != 0 ? ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 2))
|
||||
: ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 4));
|
||||
reg[i/4][i%4] = dl * q - ml;
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
|
||||
#define QK8_0 32
|
||||
// struct block_q8_0 {
|
||||
// float16_t d; // delta
|
||||
// int8_t qs[QK8_0]; // quants
|
||||
// };
|
||||
#define sizeof_block_q8_0 34
|
58
kompute-shaders/op_add.comp
Normal file
58
kompute-shaders/op_add.comp
Normal file
@ -0,0 +1,58 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb00;
|
||||
int nb01;
|
||||
int nb02;
|
||||
int nb03;
|
||||
int ne10;
|
||||
int ne11;
|
||||
int ne12;
|
||||
int ne13;
|
||||
int nb10;
|
||||
int nb11;
|
||||
int nb12;
|
||||
int nb13;
|
||||
int ne0;
|
||||
int nb0;
|
||||
int nb1;
|
||||
int nb2;
|
||||
int nb3;
|
||||
//int offs; // TODO: needed for GGML_OP_ACC, see metal code
|
||||
} pcs;
|
||||
|
||||
// general-purpose kernel for addition of two tensors
|
||||
// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
|
||||
// cons: not very efficient
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const uint i13 = i03 % pcs.ne13;
|
||||
const uint i12 = i02 % pcs.ne12;
|
||||
const uint i11 = i01 % pcs.ne11;
|
||||
|
||||
int offs = 0; // TMP (see above)
|
||||
|
||||
uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + offs) / 4);
|
||||
uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 ) / 4);
|
||||
uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1 + offs) / 4);
|
||||
|
||||
for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
|
||||
const uint i10 = i0 % pcs.ne10;
|
||||
out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] + inB[pcs.inBOff + src1_off + i10];
|
||||
}
|
||||
}
|
25
kompute-shaders/op_addrow.comp
Normal file
25
kompute-shaders/op_addrow.comp
Normal file
@ -0,0 +1,25 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
uint row;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 4;
|
||||
|
||||
for (uint x = 0; x < 4; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff];
|
||||
}
|
||||
}
|
52
kompute-shaders/op_cpy_f16_f16.comp
Normal file
52
kompute-shaders/op_cpy_f16_f16.comp
Normal file
@ -0,0 +1,52 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float16_t
|
||||
#define IN_TYPE_SIZE 2
|
||||
#define OUT_TYPE float16_t
|
||||
#define OUT_TYPE_SIZE 2
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
52
kompute-shaders/op_cpy_f16_f32.comp
Normal file
52
kompute-shaders/op_cpy_f16_f32.comp
Normal file
@ -0,0 +1,52 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float16_t
|
||||
#define IN_TYPE_SIZE 2
|
||||
#define OUT_TYPE float
|
||||
#define OUT_TYPE_SIZE 4
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
52
kompute-shaders/op_cpy_f32_f16.comp
Normal file
52
kompute-shaders/op_cpy_f32_f16.comp
Normal file
@ -0,0 +1,52 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float
|
||||
#define IN_TYPE_SIZE 4
|
||||
#define OUT_TYPE float16_t
|
||||
#define OUT_TYPE_SIZE 2
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
52
kompute-shaders/op_cpy_f32_f32.comp
Normal file
52
kompute-shaders/op_cpy_f32_f32.comp
Normal file
@ -0,0 +1,52 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float
|
||||
#define IN_TYPE_SIZE 4
|
||||
#define OUT_TYPE float
|
||||
#define OUT_TYPE_SIZE 4
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
30
kompute-shaders/op_diagmask.comp
Normal file
30
kompute-shaders/op_diagmask.comp
Normal file
@ -0,0 +1,30 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
uint n_past;
|
||||
int ne00;
|
||||
int ne01;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i02 = gl_WorkGroupID.z;
|
||||
const uint i01 = gl_WorkGroupID.y;
|
||||
const uint i00 = gl_WorkGroupID.x;
|
||||
|
||||
const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00;
|
||||
|
||||
if (i00 > pcs.n_past + i01) {
|
||||
out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000);
|
||||
} else {
|
||||
out_[index + pcs.outOff] = in_[index + pcs.inOff];
|
||||
}
|
||||
}
|
22
kompute-shaders/op_gelu.comp
Normal file
22
kompute-shaders/op_gelu.comp
Normal file
@ -0,0 +1,22 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 8;
|
||||
|
||||
for (uint x = 0; x < 8; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
const float y = in_[i + pcs.inOff];
|
||||
out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0)));
|
||||
}
|
||||
}
|
17
kompute-shaders/op_getrows.comp
Normal file
17
kompute-shaders/op_getrows.comp
Normal file
@ -0,0 +1,17 @@
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x;
|
||||
const int r = inB[i + pcs.inBOff];
|
||||
|
||||
int z = 0;
|
||||
for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) {
|
||||
const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK;
|
||||
const mat4 result = dequantize_block(inIndex, ind%NL);
|
||||
for (uint j = 0; j < 4; ++j) {
|
||||
for (uint k = 0; k < 4; ++k) {
|
||||
const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z;
|
||||
out_[outIndex] = result[j][k];
|
||||
++z;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
31
kompute-shaders/op_getrows_f16.comp
Normal file
31
kompute-shaders/op_getrows_f16.comp
Normal file
@ -0,0 +1,31 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
|
||||
for (int j = 0; j < k; j++) {
|
||||
out_[y + j] = inA[x + j];
|
||||
}
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x;
|
||||
const int r = inB[i + pcs.inBOff];
|
||||
|
||||
dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
|
||||
}
|
38
kompute-shaders/op_getrows_q4_0.comp
Normal file
38
kompute-shaders/op_getrows_q4_0.comp
Normal file
@ -0,0 +1,38 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define NL 2
|
||||
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_0
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
block_q4_0 get_unaligned_block_q4_0(uint index) {
|
||||
block_q4_0 fres;
|
||||
fres.d = u8BufToFloat16(inA, index);
|
||||
[[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) {
|
||||
fres.qs[it] = inA[index+2+it];
|
||||
}
|
||||
return fres;
|
||||
}
|
||||
|
||||
mat4 dequantize_block(uint index, uint il) {
|
||||
const block_q4_0 block = get_unaligned_block_q4_0(index);
|
||||
return dequantize_q4_0(block, il);
|
||||
}
|
||||
|
||||
#include "op_getrows.comp"
|
39
kompute-shaders/op_getrows_q4_1.comp
Normal file
39
kompute-shaders/op_getrows_q4_1.comp
Normal file
@ -0,0 +1,39 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define NL 2
|
||||
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_1
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
block_q4_1 get_unaligned_block_q4_1(uint index) {
|
||||
block_q4_1 fres;
|
||||
fres.d = u8BufToFloat16(inA, index);
|
||||
fres.m = u8BufToFloat16(inA, index+2);
|
||||
[[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
|
||||
fres.qs[it] = inA[index+4+it];
|
||||
}
|
||||
return fres;
|
||||
}
|
||||
|
||||
mat4 dequantize_block(uint index, uint il) {
|
||||
const block_q4_1 block = get_unaligned_block_q4_1(index);
|
||||
return dequantize_q4_1(block, il);
|
||||
}
|
||||
|
||||
#include "op_getrows.comp"
|
44
kompute-shaders/op_getrows_q6_k.comp
Normal file
44
kompute-shaders/op_getrows_q6_k.comp
Normal file
@ -0,0 +1,44 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define NL 16
|
||||
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||
#define SIZE_OF_BLOCK sizeof_block_q6_k
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
block_q6_k get_unaligned_block_q6_k(uint index) {
|
||||
block_q6_k fres;
|
||||
[[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
|
||||
fres.ql[it] = inA[index + it];
|
||||
}
|
||||
[[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
|
||||
fres.qh[it] = inA[index + QK_K/2 + it];
|
||||
}
|
||||
[[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
|
||||
fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
|
||||
}
|
||||
fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
|
||||
return fres;
|
||||
}
|
||||
|
||||
mat4 dequantize_block(uint index, uint il) {
|
||||
const block_q6_k block = get_unaligned_block_q6_k(index);
|
||||
return dequantize_q6_k(block, il);
|
||||
}
|
||||
|
||||
#include "op_getrows.comp"
|
52
kompute-shaders/op_mul.comp
Normal file
52
kompute-shaders/op_mul.comp
Normal file
@ -0,0 +1,52 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb00;
|
||||
int nb01;
|
||||
int nb02;
|
||||
int nb03;
|
||||
int ne10;
|
||||
int ne11;
|
||||
int ne12;
|
||||
int ne13;
|
||||
int nb10;
|
||||
int nb11;
|
||||
int nb12;
|
||||
int nb13;
|
||||
int ne0;
|
||||
int nb0;
|
||||
int nb1;
|
||||
int nb2;
|
||||
int nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const uint i13 = i03 % pcs.ne13;
|
||||
const uint i12 = i02 % pcs.ne12;
|
||||
const uint i11 = i01 % pcs.ne11;
|
||||
|
||||
uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01) / 4);
|
||||
uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11) / 4);
|
||||
uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1) / 4);
|
||||
|
||||
for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
|
||||
const uint i10 = i0 % pcs.ne10;
|
||||
out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] * inB[pcs.inBOff + src1_off + i10];
|
||||
}
|
||||
}
|
67
kompute-shaders/op_mul_mat_f16.comp
Normal file
67
kompute-shaders/op_mul_mat_f16.comp
Normal file
@ -0,0 +1,67 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
int ne10;
|
||||
int ne11;
|
||||
int ne12;
|
||||
uint nb10;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
int ne0;
|
||||
int ne1;
|
||||
uint r2;
|
||||
uint r3;
|
||||
} pcs;
|
||||
|
||||
#define N_F16_F32 4
|
||||
|
||||
void main() {
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint rb = gl_WorkGroupID.y*N_F16_F32;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb02*pcs.ne02;
|
||||
|
||||
const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
|
||||
|
||||
for (uint row = 0; row < N_F16_F32; ++row) {
|
||||
uint r1 = rb + row;
|
||||
if (r1 >= pcs.ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // Based from inB
|
||||
|
||||
float sumf = 0;
|
||||
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
|
||||
sumf += float(inA[x+i]) * float(inB[y+i]);
|
||||
}
|
||||
|
||||
const float all_sum = subgroupAdd(sumf);
|
||||
if (subgroupElect()) {
|
||||
out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
51
kompute-shaders/op_mul_mat_mat_f32.comp
Normal file
51
kompute-shaders/op_mul_mat_mat_f32.comp
Normal file
@ -0,0 +1,51 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#extension GL_EXT_debug_printf : enable
|
||||
|
||||
// device subgroup size
|
||||
layout (local_size_x_id = 0) in;
|
||||
|
||||
layout(binding = 0) readonly buffer tensorInA { float inA[]; };
|
||||
layout(binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne11;
|
||||
int ne12;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
}
|
||||
pcs;
|
||||
|
||||
|
||||
void main() {
|
||||
uvec3 gid = gl_WorkGroupID;
|
||||
|
||||
uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
|
||||
uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
|
||||
|
||||
const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
|
||||
const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
|
||||
float sum = 0.0f;
|
||||
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
|
||||
sum += float(inA[x+i]) * float(inB[y+i]);
|
||||
}
|
||||
|
||||
const float all_sum = subgroupAdd(sum);
|
||||
if (subgroupElect()) {
|
||||
out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
33
kompute-shaders/op_mul_mat_q4_0.comp
Normal file
33
kompute-shaders/op_mul_mat_q4_0.comp
Normal file
@ -0,0 +1,33 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define BLOCKS_IN_QUANT QK4_0
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_0
|
||||
#define N_ROWS 4
|
||||
|
||||
#include "op_mul_mv_q_n_pre.comp"
|
||||
|
||||
// The q4_0 version of this function
|
||||
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
|
||||
vec2 acc = vec2(0.0, 0.0);
|
||||
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
float d = float(u8BufToFloat16(inA, index));
|
||||
float sumy = 0.0f;
|
||||
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
|
||||
const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
|
||||
|
||||
const float yl0 = inB[yb + i];
|
||||
const float yl1 = inB[yb + i + 1];
|
||||
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
|
||||
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
|
||||
|
||||
sumy += yl0 + yl1 + yl8 + yl9;
|
||||
|
||||
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
|
||||
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
|
||||
}
|
||||
return d * (sumy * -8.f + acc[0] + acc[1]);
|
||||
}
|
||||
|
||||
#include "op_mul_mv_q_n.comp"
|
35
kompute-shaders/op_mul_mat_q4_1.comp
Normal file
35
kompute-shaders/op_mul_mat_q4_1.comp
Normal file
@ -0,0 +1,35 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define BLOCKS_IN_QUANT QK4_1
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_1
|
||||
#define N_ROWS 4
|
||||
|
||||
#include "op_mul_mv_q_n_pre.comp"
|
||||
|
||||
// The q4_1 version of this function
|
||||
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
|
||||
vec2 acc = vec2(0.0, 0.0);
|
||||
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
float d = float(u8BufToFloat16(inA, index));
|
||||
float m = float(u8BufToFloat16(inA, index+2));
|
||||
|
||||
float sumy = 0.0f;
|
||||
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
|
||||
const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
|
||||
|
||||
const float yl0 = inB[yb + i];
|
||||
const float yl1 = inB[yb + i + 1];
|
||||
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
|
||||
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
|
||||
|
||||
sumy += yl0 + yl1 + yl8 + yl9;
|
||||
|
||||
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
|
||||
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
|
||||
}
|
||||
return d * (acc[0] + acc[1]) + sumy * m;
|
||||
}
|
||||
|
||||
#include "op_mul_mv_q_n.comp"
|
94
kompute-shaders/op_mul_mat_q6_k.comp
Normal file
94
kompute-shaders/op_mul_mat_q6_k.comp
Normal file
@ -0,0 +1,94 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define SIZE_OF_BLOCK sizeof_block_q6_k
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
layout(local_size_y_id = 1) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne10;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne01;
|
||||
int gqa;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint8_t kmask1 = uint8_t(0x03);
|
||||
const uint8_t kmask2 = uint8_t(0x0C);
|
||||
const uint8_t kmask3 = uint8_t(0x30);
|
||||
const uint8_t kmask4 = uint8_t(0xC0);
|
||||
|
||||
const uint nb = pcs.ne00/QK_K;
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint r2 = gl_WorkGroupID.z;
|
||||
|
||||
const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
|
||||
const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
|
||||
const uint x = row * nb + offset0; // Based from inA without base offset
|
||||
const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
// bits of invocation ID for gl_SubgroupSize=32:
|
||||
// x x x x x
|
||||
// 4 3 2 1 0
|
||||
// ( tid ) ix
|
||||
// ip ( il )
|
||||
|
||||
const uint block_stride = gl_SubgroupSize / 16; // number of blocks each subgroup processes
|
||||
const uint tid = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0
|
||||
const uint ix = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1
|
||||
const uint ip = tid/8; // first or second half of block (0 or 1)
|
||||
const uint il = tid%8; // each half has 8 parts, one per scale
|
||||
const uint n = 4; // 4 scales at a time (and 4 sums)
|
||||
const uint l0 = n*il; // offset into half-block, 0..28
|
||||
const uint is = 8*ip + l0/16; // 0, 1, 8, 9
|
||||
|
||||
const uint y_offset = 128*ip + l0;
|
||||
const uint q_offset_l = 64*ip + l0;
|
||||
const uint q_offset_h = 32*ip + l0;
|
||||
|
||||
for (uint i = ix; i < nb; i += block_stride) {
|
||||
|
||||
const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
|
||||
const uint qlIndex = q_offset_l;
|
||||
const uint q2Index = qlIndex + QK_K/8;
|
||||
const uint qhIndex = q_offset_h;
|
||||
const uint y = yy + i * QK_K + y_offset;
|
||||
|
||||
float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
for (uint l = 0; l < n; ++l) {
|
||||
const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
|
||||
const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
|
||||
const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l];
|
||||
|
||||
sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
|
||||
sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
|
||||
sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32);
|
||||
sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32);
|
||||
}
|
||||
|
||||
float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
|
||||
sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
|
||||
}
|
||||
|
||||
const float tot = subgroupAdd(sumf);
|
||||
if (subgroupElect()) {
|
||||
out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
|
||||
}
|
||||
}
|
73
kompute-shaders/op_mul_mat_q8_0.comp
Normal file
73
kompute-shaders/op_mul_mat_q8_0.comp
Normal file
@ -0,0 +1,73 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#include "op_mul_mv_q_n_pre.comp"
|
||||
|
||||
#define SIZE_OF_D 2
|
||||
|
||||
#define N_DST 4 // each SIMD group works on 4 rows
|
||||
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
||||
|
||||
#define NB_Q8_0 8
|
||||
|
||||
void main() {
|
||||
// NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
|
||||
if (gl_SubgroupInvocationID > 31)
|
||||
return;
|
||||
|
||||
const int nr = N_DST;
|
||||
const int nsg = N_SIMDGROUP;
|
||||
const int nw = N_SIMDWIDTH;
|
||||
|
||||
const int nb = pcs.ne00/QK8_0;
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint first_row = (r0 * nsg + gl_SubgroupID) * nr;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
|
||||
|
||||
const uint x = offset0*sizeof_block_q8_0 + pcs.inAOff; // Based from inA
|
||||
const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; // based from inB
|
||||
|
||||
float yl[NB_Q8_0];
|
||||
float sumf[N_DST]={0.f, 0.f, 0.f, 0.f};
|
||||
|
||||
const uint ix = gl_SubgroupInvocationID.x/4;
|
||||
const uint il = gl_SubgroupInvocationID.x%4;
|
||||
|
||||
uint yb = y + ix * QK8_0 + NB_Q8_0*il;
|
||||
|
||||
// each thread in a SIMD group deals with NB_Q8_0 quants at a time
|
||||
for (uint ib = ix; ib < nb; ib += nw/4) {
|
||||
for (int i = 0; i < NB_Q8_0; ++i) {
|
||||
yl[i] = inB[yb + i];
|
||||
}
|
||||
|
||||
for (int row = 0; row < nr; row++) {
|
||||
const uint block_offset = (ib+row*nb) * sizeof_block_q8_0;
|
||||
float sumq = 0.f;
|
||||
for (int iq = 0; iq < NB_Q8_0; ++iq) {
|
||||
const int8_t qs_iq = int8_t(inA[x + block_offset + SIZE_OF_D + NB_Q8_0*il + iq]);
|
||||
sumq += qs_iq * yl[iq];
|
||||
}
|
||||
const float16_t d = u8BufToFloat16(inA, x + block_offset);
|
||||
sumf[row] += sumq*d;
|
||||
}
|
||||
|
||||
yb += NB_Q8_0 * nw;
|
||||
}
|
||||
|
||||
for (int row = 0; row < nr; ++row) {
|
||||
const float tot = subgroupAdd(sumf[row]);
|
||||
if (subgroupElect() && first_row + row < pcs.ne01) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row] = tot;
|
||||
}
|
||||
}
|
||||
}
|
48
kompute-shaders/op_mul_mv_q_n.comp
Normal file
48
kompute-shaders/op_mul_mv_q_n.comp
Normal file
@ -0,0 +1,48 @@
|
||||
void main() {
|
||||
// NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
|
||||
if (gl_SubgroupInvocationID > 31)
|
||||
return;
|
||||
|
||||
const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
|
||||
|
||||
const uint x = offset0; // Based from inA without base offset
|
||||
const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
|
||||
|
||||
float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
|
||||
const uint ix = gl_SubgroupInvocationID/2;
|
||||
const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
|
||||
|
||||
uint yb = y + ix * BLOCKS_IN_QUANT + il;
|
||||
|
||||
//debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
|
||||
// gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
|
||||
// gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
|
||||
|
||||
for (uint ib = ix; ib < nb; ib += 16) {
|
||||
for (int row = 0; row < N_ROWS; row++) {
|
||||
const uint block_index = x + ib + row * nb;
|
||||
sumf[row] += block_q_n_dot_y(block_index, yb, il);
|
||||
}
|
||||
|
||||
yb += BLOCKS_IN_QUANT * 16;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_ROWS; ++row) {
|
||||
const float tot = subgroupAdd(sumf[row]);
|
||||
if (first_row + row < pcs.ne01 && subgroupElect()) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
|
||||
}
|
||||
}
|
||||
}
|
22
kompute-shaders/op_mul_mv_q_n_pre.comp
Normal file
22
kompute-shaders/op_mul_mv_q_n_pre.comp
Normal file
@ -0,0 +1,22 @@
|
||||
layout(local_size_x_id = 0) in;
|
||||
layout(local_size_y = 1) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne10;
|
||||
int ne12;
|
||||
int ne0;
|
||||
int ne1;
|
||||
uint r2;
|
||||
uint r3;
|
||||
} pcs;
|
84
kompute-shaders/op_norm.comp
Normal file
84
kompute-shaders/op_norm.comp
Normal file
@ -0,0 +1,84 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 256) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
uint ne00;
|
||||
uint nb01;
|
||||
float eps;
|
||||
} pcs;
|
||||
|
||||
shared float sum[gl_WorkGroupSize.x];
|
||||
|
||||
void main() {
|
||||
const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
|
||||
// MEAN
|
||||
// parallel sum
|
||||
sum[gl_LocalInvocationID.x] = 0.0;
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
sum[gl_LocalInvocationID.x] += in_[x+i00];
|
||||
}
|
||||
|
||||
// reduce
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||
if (gl_LocalInvocationID.x < i) {
|
||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
}
|
||||
|
||||
// broadcast
|
||||
if (gl_LocalInvocationID.x == 0) {
|
||||
sum[0] /= float(pcs.ne00);
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
const float mean = sum[0];
|
||||
|
||||
// recenter
|
||||
const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
out_[y+i00] = in_[x+i00] - mean;
|
||||
}
|
||||
|
||||
// VARIANCE
|
||||
// parallel sum
|
||||
sum[gl_LocalInvocationID.x] = 0.0;
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
|
||||
}
|
||||
|
||||
// reduce
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||
if (gl_LocalInvocationID.x < i) {
|
||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
}
|
||||
|
||||
// broadcast
|
||||
if (gl_LocalInvocationID.x == 0) {
|
||||
sum[0] /= float(pcs.ne00);
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
const float variance = sum[0];
|
||||
|
||||
const float scale = 1.0f/sqrt(variance + pcs.eps);
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
out_[y+i00] *= scale;
|
||||
}
|
||||
}
|
21
kompute-shaders/op_relu.comp
Normal file
21
kompute-shaders/op_relu.comp
Normal file
@ -0,0 +1,21 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 4;
|
||||
|
||||
for (uint x = 0; x < 4; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]);
|
||||
}
|
||||
}
|
53
kompute-shaders/op_rmsnorm.comp
Normal file
53
kompute-shaders/op_rmsnorm.comp
Normal file
@ -0,0 +1,53 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 512) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
uint ne00;
|
||||
uint nb01;
|
||||
float eps;
|
||||
} pcs;
|
||||
|
||||
shared float sum[gl_WorkGroupSize.x];
|
||||
|
||||
void main() {
|
||||
const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
|
||||
|
||||
// parallel sum
|
||||
sum[gl_LocalInvocationID.x] = 0.0;
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
|
||||
}
|
||||
|
||||
// reduce
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||
if (gl_LocalInvocationID.x < i) {
|
||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
}
|
||||
|
||||
// broadcast
|
||||
if (gl_LocalInvocationID.x == 0) {
|
||||
sum[0] /= float(pcs.ne00);
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
|
||||
const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
|
||||
|
||||
const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
out_[y+i00] = in_[x+i00] * scale;
|
||||
}
|
||||
}
|
73
kompute-shaders/op_rope_f16.comp
Normal file
73
kompute-shaders/op_rope_f16.comp
Normal file
@ -0,0 +1,73 @@
|
||||
#version 450
|
||||
|
||||
#include "rope_common.comp"
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float16_t out_[]; };
|
||||
|
||||
void main() {
|
||||
const uint i3 = gl_WorkGroupID.z;
|
||||
const uint i2 = gl_WorkGroupID.y;
|
||||
const uint i1 = gl_WorkGroupID.x;
|
||||
|
||||
const bool is_neox = (pcs.mode & 2) != 0;
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
|
||||
|
||||
const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
|
||||
|
||||
const int p = inB[pcs.inBOff + i2];
|
||||
|
||||
float theta = float(p);
|
||||
|
||||
if (!is_neox) {
|
||||
for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
theta *= theta_scale;
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = float(inA[src]);
|
||||
const float x1 = float(inA[src+1]);
|
||||
|
||||
out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
|
||||
out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
|
||||
}
|
||||
} else {
|
||||
const float inv_ndims = -1.f/pcs.n_dims;
|
||||
for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
|
||||
const uint cur_rot = ic;
|
||||
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
theta *= theta_scale;
|
||||
|
||||
const uint i0 = ic/2;
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = float(inA[src]);
|
||||
const float x1 = float(inA[src+pcs.n_dims/2]);
|
||||
|
||||
out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
|
||||
out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
|
||||
}
|
||||
|
||||
for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
|
||||
const uint i0 = ic;
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
|
||||
|
||||
out_[dst_data + 0] = inA[src + 0];
|
||||
out_[dst_data + 1] = inA[src + 1];
|
||||
}
|
||||
}
|
||||
}
|
73
kompute-shaders/op_rope_f32.comp
Normal file
73
kompute-shaders/op_rope_f32.comp
Normal file
@ -0,0 +1,73 @@
|
||||
#version 450
|
||||
|
||||
#include "rope_common.comp"
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
void main() {
|
||||
const uint i3 = gl_WorkGroupID.z;
|
||||
const uint i2 = gl_WorkGroupID.y;
|
||||
const uint i1 = gl_WorkGroupID.x;
|
||||
|
||||
const bool is_neox = (pcs.mode & 2) != 0;
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
|
||||
|
||||
const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
|
||||
|
||||
const int p = inB[pcs.inBOff + i2];
|
||||
|
||||
float theta = float(p);
|
||||
|
||||
if (!is_neox) {
|
||||
for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
theta *= theta_scale;
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = inA[src];
|
||||
const float x1 = inA[src+1];
|
||||
|
||||
out_[dst_data] = x0*cos_theta - x1*sin_theta;
|
||||
out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
} else {
|
||||
const float inv_ndims = -1.f/pcs.n_dims;
|
||||
for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
|
||||
const uint cur_rot = ic;
|
||||
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
theta *= theta_scale;
|
||||
|
||||
const uint i0 = ic/2;
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = inA[src];
|
||||
const float x1 = inA[src+pcs.n_dims/2];
|
||||
|
||||
out_[dst_data] = x0*cos_theta - x1*sin_theta;
|
||||
out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
|
||||
for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
|
||||
const uint i0 = ic;
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
|
||||
|
||||
out_[dst_data + 0] = inA[src + 0];
|
||||
out_[dst_data + 1] = inA[src + 1];
|
||||
}
|
||||
}
|
||||
}
|
19
kompute-shaders/op_scale.comp
Normal file
19
kompute-shaders/op_scale.comp
Normal file
@ -0,0 +1,19 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
float scale;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x;
|
||||
out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
|
||||
}
|
23
kompute-shaders/op_scale_8.comp
Normal file
23
kompute-shaders/op_scale_8.comp
Normal file
@ -0,0 +1,23 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
float scale;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 8;
|
||||
|
||||
for (uint x = 0; x < 8; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
|
||||
}
|
||||
}
|
22
kompute-shaders/op_silu.comp
Normal file
22
kompute-shaders/op_silu.comp
Normal file
@ -0,0 +1,22 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 4;
|
||||
|
||||
for (uint x = 0; x < 4; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
const float y = in_[i + pcs.inOff];
|
||||
out_[i + pcs.outOff] = y / (1.0 + exp(-y));
|
||||
}
|
||||
}
|
56
kompute-shaders/op_softmax.comp
Normal file
56
kompute-shaders/op_softmax.comp
Normal file
@ -0,0 +1,56 @@
|
||||
// TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4)
|
||||
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
float scale;
|
||||
int mask;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
if (gl_SubgroupInvocationID > 31)
|
||||
return;
|
||||
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00;
|
||||
const uint psrc0 = extra_off + pcs.inAOff; // Based from inA
|
||||
const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
|
||||
const uint pdst = extra_off + pcs.outOff; // Based from out_
|
||||
|
||||
// parallel max
|
||||
float localMax = uintBitsToFloat(0xFF800000);
|
||||
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
|
||||
localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f));
|
||||
}
|
||||
float max_ = subgroupMax(localMax);
|
||||
|
||||
// parallel sum
|
||||
float localSum = 0.0f;
|
||||
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
|
||||
const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f) - max_);
|
||||
localSum += exp_psrc0;
|
||||
out_[pdst + i00] = exp_psrc0;
|
||||
}
|
||||
|
||||
const float sum = subgroupAdd(localSum);
|
||||
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
|
||||
out_[pdst + i00] /= sum;
|
||||
}
|
||||
}
|
67
kompute-shaders/rope_common.comp
Normal file
67
kompute-shaders/rope_common.comp
Normal file
@ -0,0 +1,67 @@
|
||||
#include "common.comp"
|
||||
|
||||
// TODO: use a local size of 32 or more (Metal uses 1024)
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int n_dims;
|
||||
int mode;
|
||||
int n_orig_ctx;
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
float ext_factor;
|
||||
float attn_factor;
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
float rope_yarn_ramp(const float low, const float high, const float i0) {
|
||||
const float y = (i0 / 2 - low) / max(0.001f, high - low);
|
||||
return 1.0f - min(1.0f, max(0.0f, y));
|
||||
}
|
||||
|
||||
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
|
||||
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
||||
void rope_yarn(
|
||||
float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale,
|
||||
out float cos_theta, out float sin_theta
|
||||
) {
|
||||
// Get n-d rotational scaling corrected for extrapolation
|
||||
float theta_interp = freq_scale * theta_extrap;
|
||||
float theta = theta_interp;
|
||||
if (ext_factor != 0.0f) {
|
||||
float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
|
||||
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
|
||||
// Get n-d magnitude scaling corrected for interpolation
|
||||
mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
|
||||
}
|
||||
cos_theta = cos(theta) * mscale;
|
||||
sin_theta = sin(theta) * mscale;
|
||||
}
|
||||
|
||||
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
||||
// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
||||
float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
|
||||
return n_dims * log(n_orig_ctx / (n_rot * TWOPI_F)) / (2 * log(base));
|
||||
}
|
||||
|
||||
void rope_yarn_corr_dims(
|
||||
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, out float dims[2]
|
||||
) {
|
||||
// start and end correction dims
|
||||
dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base)));
|
||||
dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base)));
|
||||
}
|
63
llama.cpp
63
llama.cpp
@ -15,6 +15,8 @@
|
||||
# include "ggml-vulkan.h"
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
# include "ggml-sycl.h"
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
# include "ggml-kompute.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
@ -1160,10 +1162,10 @@ struct llama_mlock {
|
||||
#ifdef __APPLE__
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
||||
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
|
||||
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
|
||||
#else
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
|
||||
"Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
|
||||
#endif
|
||||
|
||||
bool raw_lock(const void * addr, size_t size) const {
|
||||
@ -1315,6 +1317,11 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
||||
buft = ggml_backend_sycl_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
buft = ggml_backend_opencl_buffer_type();
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
buft = ggml_backend_kompute_buffer_type(gpu);
|
||||
if (buft == nullptr) {
|
||||
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (buft == nullptr) {
|
||||
@ -2362,6 +2369,7 @@ struct llama_model_loader {
|
||||
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
|
||||
case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
|
||||
case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
|
||||
case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
|
||||
default:
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
@ -2710,6 +2718,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XSS - 3.0625 bpw";
|
||||
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
@ -4109,7 +4118,7 @@ static bool llm_load_tensors(
|
||||
}
|
||||
|
||||
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
||||
static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
|
||||
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
|
||||
try {
|
||||
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
|
||||
|
||||
@ -4130,6 +4139,22 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
if (params.n_gpu_layers > 0 && (
|
||||
!(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
|
||||
|| !(
|
||||
model.ftype == LLAMA_FTYPE_ALL_F32 ||
|
||||
model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
|
||||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
||||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
|
||||
)
|
||||
)) {
|
||||
// TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
|
||||
LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
|
||||
params.n_gpu_layers = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!llm_load_tensors(
|
||||
ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
|
||||
params.progress_callback, params.progress_callback_user_data
|
||||
@ -9250,6 +9275,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||
else if (new_type != GGML_TYPE_Q8_0) {
|
||||
new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
} else if (name == "token_embd.weight") {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
||||
new_type = GGML_TYPE_Q2_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
||||
if (name.find("attn_v.weight") != std::string::npos) {
|
||||
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
||||
@ -9260,7 +9292,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||
if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
|
||||
++qs.i_ffn_down;
|
||||
}
|
||||
else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
|
||||
} else if (name.find("attn_v.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
|
||||
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
|
||||
@ -9268,6 +9299,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() >= 4) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
}
|
||||
@ -9305,6 +9339,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
|
||||
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
//else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
// if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
|
||||
//}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
|
||||
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
|
||||
@ -9336,13 +9373,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||
} else if (name.find("attn_output.weight") != std::string::npos) {
|
||||
if (arch != LLM_ARCH_FALCON) {
|
||||
if (qs.model.hparams.n_expert == 8) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
||||
new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
} else {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
@ -9385,7 +9423,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||
bool convert_incompatible_tensor = false;
|
||||
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
||||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
|
||||
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
|
||||
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
|
||||
new_type == GGML_TYPE_IQ3_XXS) {
|
||||
int nx = tensor->ne[0];
|
||||
int ny = tensor->ne[1];
|
||||
if (nx % QK_K != 0) {
|
||||
@ -9399,6 +9438,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||
switch (new_type) {
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
|
||||
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
|
||||
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
|
||||
@ -9440,6 +9480,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
|
||||
|
||||
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||
}
|
||||
@ -10295,6 +10336,16 @@ struct llama_context * llama_new_context_with_model(
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
}
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
if (model->n_gpu_layers > 0) {
|
||||
auto * backend = ggml_backend_kompute_init(model->main_gpu);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
}
|
||||
#endif
|
||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||
if (ctx->backend_cpu == nullptr) {
|
||||
|
4
llama.h
4
llama.h
@ -49,7 +49,8 @@
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 4
|
||||
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
||||
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
#endif
|
||||
@ -111,6 +112,7 @@ extern "C" {
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
@ -1 +1 @@
|
||||
f2a9472b23cf27e672ed70a2a6eb078f7b060f18
|
||||
475cbad5c1c834e31e26a2283bc1413181644360
|
||||
|
@ -370,12 +370,15 @@ struct test_case {
|
||||
printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
|
||||
fflush(stdout);
|
||||
|
||||
// check if backends support op
|
||||
// check if the backends support the ops
|
||||
bool supported = true;
|
||||
for (ggml_backend_t backend : {backend1, backend2}) {
|
||||
if (!ggml_backend_supports_op(backend, out)) {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
if (!ggml_backend_supports_op(backend, t)) {
|
||||
printf("not supported [%s] ", ggml_backend_name(backend));
|
||||
supported = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!supported) {
|
||||
@ -626,6 +629,13 @@ struct test_unary : public test_case {
|
||||
ggml_tensor * out = ggml_unary(ctx, in, op);
|
||||
return out;
|
||||
}
|
||||
|
||||
void initialize_tensors(ggml_context * ctx) override {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
// test extended range of values to check for NaNs in GELU
|
||||
init_tensor_uniform(t, -150.f, 150.f);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_GET_ROWS
|
||||
@ -1066,18 +1076,24 @@ struct test_diag_mask_inf : public test_case {
|
||||
struct test_soft_max : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const float scale;
|
||||
const bool mask;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR2(type, ne);
|
||||
return VARS_TO_STR4(type, ne, scale, mask);
|
||||
}
|
||||
|
||||
test_soft_max(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 10, 10, 10})
|
||||
: type(type), ne(ne) {}
|
||||
std::array<int64_t, 4> ne = {10, 10, 10, 10},
|
||||
float scale = 1.0f,
|
||||
bool mask = false)
|
||||
: type(type), ne(ne), scale(scale), mask(mask) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_tensor * out = ggml_soft_max(ctx, a);
|
||||
ggml_tensor * b = nullptr;
|
||||
if (mask) { b = ggml_new_tensor_2d(ctx, type, ne[0], ne[1]); }
|
||||
ggml_tensor * out = ggml_soft_max_ext(ctx, a, b, scale);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
@ -1544,6 +1560,393 @@ struct test_moe : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
enum llm_norm_type {
|
||||
LLM_NORM,
|
||||
LLM_NORM_RMS,
|
||||
};
|
||||
|
||||
struct llama_hparams {
|
||||
uint32_t n_vocab;
|
||||
uint32_t n_embd;
|
||||
uint32_t n_head;
|
||||
uint32_t n_head_kv;
|
||||
static constexpr uint32_t n_layer = 1;
|
||||
uint32_t n_rot;
|
||||
uint32_t n_embd_head; // dimension of values (d_v)
|
||||
uint32_t n_ff;
|
||||
|
||||
float f_norm_eps;
|
||||
float f_norm_rms_eps;
|
||||
|
||||
// cparams
|
||||
static constexpr uint32_t n_ctx = 512; // user-specified context size
|
||||
static constexpr uint32_t n_orig_ctx = n_ctx;
|
||||
|
||||
// batch
|
||||
int32_t n_tokens;
|
||||
|
||||
// llm_build_context
|
||||
static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx
|
||||
static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache
|
||||
|
||||
uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads
|
||||
return n_embd_head * n_head_kv;
|
||||
}
|
||||
};
|
||||
|
||||
// LLM base class
|
||||
struct test_llm : public test_case {
|
||||
llama_hparams hp;
|
||||
|
||||
protected:
|
||||
test_llm(llama_hparams hp)
|
||||
: hp(std::move(hp)) {
|
||||
}
|
||||
|
||||
public:
|
||||
struct ggml_tensor * llm_build_norm(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * cur,
|
||||
struct ggml_tensor * mw,
|
||||
struct ggml_tensor * mb,
|
||||
llm_norm_type type) {
|
||||
switch (type) {
|
||||
case LLM_NORM: cur = ggml_norm (ctx, cur, hp.f_norm_eps); break;
|
||||
case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hp.f_norm_rms_eps); break;
|
||||
}
|
||||
cur = ggml_mul(ctx, cur, mw);
|
||||
if (mb) {
|
||||
cur = ggml_add(ctx, cur, mb);
|
||||
}
|
||||
return cur;
|
||||
}
|
||||
|
||||
void llm_build_kv_store(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * k_l,
|
||||
struct ggml_tensor * v_l,
|
||||
struct ggml_tensor * k_cur,
|
||||
struct ggml_tensor * v_cur) {
|
||||
// compute the transposed [n_tokens, n_embd] V matrix
|
||||
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, hp.n_embd_gqa(), hp.n_tokens));
|
||||
|
||||
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, hp.n_tokens*hp.n_embd_gqa(),
|
||||
(ggml_row_size(k_l->type, hp.n_embd_gqa()))*hp.kv_head);
|
||||
|
||||
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, v_l, hp.n_tokens, hp.n_embd_gqa(),
|
||||
( hp.n_ctx)*ggml_element_size(v_l),
|
||||
(hp.kv_head)*ggml_element_size(v_l));
|
||||
|
||||
// important: storing RoPE-ed version of K in the KV cache!
|
||||
ggml_cpy(ctx, k_cur, k_cache_view);
|
||||
ggml_cpy(ctx, v_cur_t, v_cache_view);
|
||||
}
|
||||
|
||||
// if max_alibi_bias > 0 then apply ALiBi
|
||||
struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * k_l,
|
||||
struct ggml_tensor * v_l,
|
||||
struct ggml_tensor * q_cur,
|
||||
struct ggml_tensor * kq_mask,
|
||||
float kq_scale) {
|
||||
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
|
||||
|
||||
struct ggml_tensor * k =
|
||||
ggml_view_3d(ctx, k_l,
|
||||
hp.n_embd_head, hp.n_kv, hp.n_head_kv,
|
||||
ggml_row_size(k_l->type, hp.n_embd_gqa()),
|
||||
ggml_row_size(k_l->type, hp.n_embd_head),
|
||||
0);
|
||||
|
||||
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
||||
|
||||
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
|
||||
|
||||
// split cached v into n_head heads
|
||||
struct ggml_tensor * v =
|
||||
ggml_view_3d(ctx, v_l,
|
||||
hp.n_kv, hp.n_embd_head, hp.n_head_kv,
|
||||
ggml_element_size(v_l)*hp.n_ctx,
|
||||
ggml_element_size(v_l)*hp.n_ctx*hp.n_embd_head,
|
||||
0);
|
||||
|
||||
struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
|
||||
|
||||
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
|
||||
|
||||
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, hp.n_embd_head*hp.n_head, hp.n_tokens);
|
||||
|
||||
struct ggml_tensor * wo = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
|
||||
cur = ggml_mul_mat(ctx, wo, cur);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
void initialize_tensors(ggml_context * ctx) override {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
if (t->type == GGML_TYPE_I32) {
|
||||
// pos
|
||||
std::vector<int> data(hp.n_tokens);
|
||||
for (int i = 0; i < hp.n_tokens; i++) {
|
||||
data[i] = rand() % hp.n_ctx;
|
||||
}
|
||||
ggml_backend_tensor_set(t, data.data(), 0, hp.n_tokens * sizeof(int));
|
||||
} else {
|
||||
init_tensor_uniform(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Llama
|
||||
struct test_llama : public test_llm {
|
||||
static constexpr float freq_base = 10000.0f;
|
||||
static constexpr float freq_scale = 1.0f;
|
||||
static constexpr float ext_factor = 0.0f;
|
||||
static constexpr float attn_factor = 1.0f;
|
||||
static constexpr float beta_fast = 32.0f;
|
||||
static constexpr float beta_slow = 1.0f;
|
||||
|
||||
std::string op_desc(ggml_tensor * t) override {
|
||||
GGML_UNUSED(t);
|
||||
return "LLAMA";
|
||||
}
|
||||
|
||||
std::string vars() override {
|
||||
auto n_tokens = hp.n_tokens;
|
||||
return VARS_TO_STR1(n_tokens);
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
return 2e-3;
|
||||
}
|
||||
|
||||
test_llama(int n_tokens = 1)
|
||||
: test_llm({
|
||||
/*n_vocab =*/ 32000,
|
||||
/*n_embd =*/ 3200,
|
||||
/*n_head =*/ 32,
|
||||
/*n_head_kv =*/ 32,
|
||||
/*n_rot =*/ 100,
|
||||
/*n_embd_head =*/ 100,
|
||||
/*n_ff =*/ 8640,
|
||||
/*f_norm_eps =*/ 0.f,
|
||||
/*f_norm_rms_eps =*/ 1e-5f,
|
||||
/*n_tokens =*/ n_tokens,
|
||||
}) {
|
||||
}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
|
||||
|
||||
ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
||||
ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
||||
|
||||
for (uint32_t il = 0; il < hp.n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
ggml_tensor * attn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
||||
cur = llm_build_norm(ctx, inpL, attn_norm, nullptr, LLM_NORM_RMS);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * wq = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
|
||||
ggml_tensor * wk = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
|
||||
ggml_tensor * wv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx, wq, cur);
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur);
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos,
|
||||
hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos,
|
||||
hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
|
||||
|
||||
cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx, cur, inpSA);
|
||||
|
||||
// feed-forward network
|
||||
ggml_tensor * ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
||||
cur = llm_build_norm(ctx, ffn_inp, ffn_norm, nullptr, LLM_NORM_RMS);
|
||||
|
||||
ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
|
||||
ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
|
||||
ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
|
||||
struct ggml_tensor * tmp = ggml_mul_mat(ctx, ffn_up, cur);
|
||||
cur = ggml_mul_mat(ctx, ffn_gate, cur);
|
||||
cur = ggml_silu(ctx, cur);
|
||||
cur = ggml_mul(ctx, cur, tmp);
|
||||
cur = ggml_mul_mat(ctx, ffn_down, cur);
|
||||
|
||||
cur = ggml_add(ctx, cur, ffn_inp);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
||||
cur = llm_build_norm(ctx, cur, output_norm, nullptr, LLM_NORM_RMS);
|
||||
|
||||
// lm_head
|
||||
ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_vocab);
|
||||
cur = ggml_mul_mat(ctx, output, cur);
|
||||
|
||||
return cur;
|
||||
}
|
||||
};
|
||||
|
||||
// Falcon
|
||||
struct test_falcon : public test_llm {
|
||||
static constexpr float freq_base = 10000.0f;
|
||||
static constexpr float freq_scale = 1.0f;
|
||||
static constexpr float ext_factor = 0.0f;
|
||||
static constexpr float attn_factor = 1.0f;
|
||||
static constexpr float beta_fast = 32.0f;
|
||||
static constexpr float beta_slow = 1.0f;
|
||||
|
||||
std::string op_desc(ggml_tensor * t) override {
|
||||
GGML_UNUSED(t);
|
||||
return "FALCON";
|
||||
}
|
||||
|
||||
std::string vars() override {
|
||||
auto n_tokens = hp.n_tokens;
|
||||
return VARS_TO_STR1(n_tokens);
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
return 2e-3;
|
||||
}
|
||||
|
||||
test_falcon(int n_tokens = 1)
|
||||
: test_llm({
|
||||
/*n_vocab =*/ 32000,
|
||||
/*n_embd =*/ 3200,
|
||||
/*n_head =*/ 50,
|
||||
/*n_head_kv =*/ 1,
|
||||
/*n_rot =*/ 64,
|
||||
/*n_embd_head =*/ 64,
|
||||
/*n_ff =*/ 8640,
|
||||
/*f_norm_eps =*/ 1e-5f,
|
||||
/*f_norm_rms_eps =*/ 0.f,
|
||||
/*n_tokens =*/ n_tokens,
|
||||
}) {
|
||||
}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
|
||||
|
||||
ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
||||
ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
||||
|
||||
for (uint32_t il = 0; il < hp.n_layer; ++il) {
|
||||
// norm
|
||||
ggml_tensor * attn_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
||||
ggml_tensor * attn_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
||||
ggml_tensor * attn_norm = llm_build_norm(ctx, inpL, attn_norm_w, attn_norm_b, LLM_NORM);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
cur = attn_norm;
|
||||
|
||||
ggml_tensor * wqkv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd + 2*hp.n_embd_gqa());
|
||||
|
||||
cur = ggml_mul_mat(ctx, wqkv, cur);
|
||||
|
||||
struct ggml_tensor * Qcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd, hp.n_tokens, cur->nb[1], 0*sizeof(float)*(hp.n_embd)));
|
||||
struct ggml_tensor * Kcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd)));
|
||||
struct ggml_tensor * Vcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd + hp.n_embd_gqa())));
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens);
|
||||
|
||||
// using mode = 2 for neox mode
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx, Qcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx, Kcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
|
||||
|
||||
cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = cur;
|
||||
|
||||
// feed forward
|
||||
{
|
||||
ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
|
||||
ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
|
||||
cur = attn_norm;
|
||||
cur = ggml_mul_mat(ctx, ffn_up, cur);
|
||||
cur = ggml_gelu(ctx, cur);
|
||||
cur = ggml_mul_mat(ctx, ffn_down, cur);
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx, cur, ffn_inp);
|
||||
|
||||
cur = ggml_add(ctx, cur, inpL);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
||||
ggml_tensor * output_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
||||
cur = llm_build_norm(ctx, cur, output_norm, output_norm_b, LLM_NORM);
|
||||
|
||||
// lm_head
|
||||
ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, hp.n_embd, hp.n_vocab);
|
||||
cur = ggml_mul_mat(ctx, output, cur);
|
||||
|
||||
return cur;
|
||||
}
|
||||
};
|
||||
|
||||
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
|
||||
std::vector<std::unique_ptr<test_case>> test_cases;
|
||||
std::default_random_engine rng(0);
|
||||
@ -1557,6 +1960,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
|
||||
GGML_TYPE_Q6_K,
|
||||
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
|
||||
GGML_TYPE_IQ3_XXS,
|
||||
};
|
||||
|
||||
// unary ops
|
||||
@ -1593,8 +1997,10 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
|
||||
|
||||
for (ggml_type type : all_types) {
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
|
||||
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
||||
for (ggml_type type_dst : all_types) {
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_cont());
|
||||
@ -1696,6 +2102,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
exponent <<= 1;
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, 0.1f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, 0.1f, true));
|
||||
|
||||
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||
test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512)); // llama 7B
|
||||
test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512)); // llama 13B
|
||||
@ -1756,6 +2165,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
//test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
|
||||
#endif
|
||||
|
||||
// these tests are disabled to save execution time, but they can be handy for debugging
|
||||
#if 0
|
||||
test_cases.emplace_back(new test_llama(1));
|
||||
test_cases.emplace_back(new test_llama(2));
|
||||
test_cases.emplace_back(new test_falcon(1));
|
||||
test_cases.emplace_back(new test_falcon(2));
|
||||
#endif
|
||||
|
||||
// run tests
|
||||
if (mode == MODE_TEST) {
|
||||
ggml_backend_t backend_cpu = ggml_backend_cpu_init();
|
||||
|
@ -1,3 +1,7 @@
|
||||
#include "llama.h"
|
||||
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
#include "ggml-kompute.h"
|
||||
#endif
|
||||
|
||||
int main(void) {}
|
||||
|
@ -17,7 +17,9 @@ constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
|
||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
|
||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
|
||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
|
||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f;
|
||||
constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
|
||||
constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f;
|
||||
|
||||
static const char* RESULT_STR[] = {"ok", "FAILED"};
|
||||
|
||||
@ -135,18 +137,21 @@ int main(int argc, char * argv[]) {
|
||||
}
|
||||
|
||||
const ggml_type ei = (ggml_type)i;
|
||||
|
||||
if (ei == GGML_TYPE_IQ2_XXS || ei == GGML_TYPE_IQ2_XS) {
|
||||
printf("Skip %s due to missing quantization functionality\n", ggml_type_name(ei));
|
||||
continue;
|
||||
}
|
||||
|
||||
printf("Testing %s\n", ggml_type_name((ggml_type) i));
|
||||
ggml_quantize_init(ei);
|
||||
|
||||
if (qfns.from_float && qfns.to_float) {
|
||||
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
|
||||
const float max_quantization_error =
|
||||
type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
||||
type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : MAX_QUANTIZATION_TOTAL_ERROR;
|
||||
type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
||||
type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR;
|
||||
failed = !(total_error < max_quantization_error);
|
||||
num_failed += failed;
|
||||
if (failed || verbose) {
|
||||
@ -161,7 +166,9 @@ int main(int argc, char * argv[]) {
|
||||
}
|
||||
|
||||
const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
|
||||
failed = !(vec_dot_error < MAX_DOT_PRODUCT_ERROR);
|
||||
const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
|
||||
type == GGML_TYPE_IQ3_XXS ? MAX_DOT_PRODUCT_ERROR_LOWBIT : MAX_DOT_PRODUCT_ERROR;
|
||||
failed = !(vec_dot_error < max_allowed_error);
|
||||
num_failed += failed;
|
||||
if (failed || verbose) {
|
||||
printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
|
||||
|
@ -278,6 +278,8 @@ int main(int argc, char * argv[]) {
|
||||
if (qfns.from_float && qfns.to_float) {
|
||||
printf("%s\n", ggml_type_name(type));
|
||||
|
||||
ggml_quantize_init(type);
|
||||
|
||||
if (params.op_quantize_row_q_reference) {
|
||||
printf(" quantize_row_q_reference\n");
|
||||
for (size_t size : params.test_sizes) {
|
||||
|
Loading…
Reference in New Issue
Block a user