Kompute shaders: move the shared GLSL preamble into kompute/common.comp.

The op_*.comp shaders replace their private copy of the extension list, constants,
byte-buffer macros and quantized block structs with `#include "common.comp"`, and the
glslc custom command lists common.comp in DEPENDS so that editing it re-runs glslc.
common.comp defines QK_K / K_SCALE_SIZE once and declares block_q4_K as a plain struct
in both QK_K branches (`typedef` is a reserved word in GLSL).

NOTE(review): leading whitespace of the context lines in the CMakeLists.txt hunk was
rebuilt from a whitespace-collapsed copy of this patch -- confirm against the tree, or
try `git apply --ignore-whitespace` if the context fails to match. The post-image blob
ids on the index lines of CMakeLists.txt and common.comp predate the edits made to
those two hunks. The per-shader hunks that follow are carried over unchanged.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 88585fb93..31532df91 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -429,7 +429,8 @@ if (LLAMA_KOMPUTE)
       set(spv_file ${source}.spv)
       add_custom_command(
         OUTPUT ${spv_file}
-        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} ${CMAKE_CURRENT_SOURCE_DIR}/kompute/common.comp
         COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
         COMMENT "Compiling ${source} to ${source}.spv"
+        VERBATIM
       )
diff --git a/kompute/common.comp b/kompute/common.comp
new file mode 100644
index 000000000..12fc7d8b5
--- /dev/null
+++ b/kompute/common.comp
@@ -0,0 +1,120 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+// Shared preamble for the Kompute compute shaders: required extensions, quantization
+// constants, byte-buffer decoding macros and the quantized block layouts. The op_*.comp
+// shaders pull it in with `#include "common.comp"` right after their `#version` line.
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+// Super-block size; 256 unless QK_K was already defined before this point.
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+// Assemble little-endian scalars from consecutive elements of a byte buffer.
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4]; // quants
+    float16_t d; // super-block scale for quantized scales
+    float16_t dmin; // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8]; // quants - high bit
+    uint8_t qs[QK_K/4]; // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d; // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K {
+    float16_t d[2]; // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2]; // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d; // super-block scale for quantized scales
+    float16_t dmin; // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2]; // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t d; // super-block scales/mins
+    int8_t scales[QK_K/16]; // 8-bit block scales
+    uint8_t qh[QK_K/8]; // quants, high bit
+    uint8_t qs[QK_K/2]; // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d; // super-block scale for quantized scales
+    float16_t dmin; // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8]; // quants, high bit
+    uint8_t qs[QK_K/2]; // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2]; // quants, lower 4 bits
+    uint8_t qh[QK_K/4]; // quants, upper 2 bits
+    int8_t scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d; // super-block scale
+};
+// 210 bytes / block
diff --git a/kompute/op_add.comp b/kompute/op_add.comp index 7e4e43d75..019a68449 100644 --- a/kompute/op_add.comp +++ b/kompute/op_add.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - 
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t 
scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_addrow.comp b/kompute/op_addrow.comp index 492f672e5..926c929e4 100644 --- a/kompute/op_addrow.comp +++ b/kompute/op_addrow.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define 
K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_cpy_f16_f16.comp b/kompute/op_cpy_f16_f16.comp index 40d756ae5..5f425ae28 100644 --- a/kompute/op_cpy_f16_f16.comp +++ b/kompute/op_cpy_f16_f16.comp @@ 
-8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block 
scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" #define nth 32 #define IN_TYPE float16_t diff --git a/kompute/op_cpy_f16_f32.comp b/kompute/op_cpy_f16_f32.comp index 309c48aed..4298bebdd 100644 --- a/kompute/op_cpy_f16_f32.comp +++ b/kompute/op_cpy_f16_f32.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 
-#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // 
super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" #define nth 32 #define IN_TYPE float16_t diff --git a/kompute/op_cpy_f32_f16.comp b/kompute/op_cpy_f32_f16.comp index fb0e00d67..2d763edfd 100644 --- a/kompute/op_cpy_f32_f16.comp +++ b/kompute/op_cpy_f32_f16.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - 
uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" #define nth 32 #define IN_TYPE float diff --git 
a/kompute/op_cpy_f32_f32.comp b/kompute/op_cpy_f32_f32.comp index f43480b8d..4e5b1d393 100644 --- a/kompute/op_cpy_f32_f32.comp +++ b/kompute/op_cpy_f32_f32.comp @@ -1,121 +1,6 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t 
scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" #define nth 32 #define IN_TYPE float diff --git a/kompute/op_diagmask.comp b/kompute/op_diagmask.comp index 18b0192d7..8dc2cc60a 100644 --- a/kompute/op_diagmask.comp +++ b/kompute/op_diagmask.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 
0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 
8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_gelu.comp b/kompute/op_gelu.comp index 8079b8ef2..c9f8ce3cf 100644 --- a/kompute/op_gelu.comp +++ b/kompute/op_gelu.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 
-struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 
210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_getrows_f16.comp b/kompute/op_getrows_f16.comp index e0f5bb16e..17b478b5e 100644 --- a/kompute/op_getrows_f16.comp +++ b/kompute/op_getrows_f16.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - 
low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_getrows_q4_0.comp b/kompute/op_getrows_q4_0.comp index cddba929b..590f218e6 100644 --- a/kompute/op_getrows_q4_0.comp +++ b/kompute/op_getrows_q4_0.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - 
-#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K 
{ - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_getrows_q4_1.comp b/kompute/op_getrows_q4_1.comp index 151848a9d..44718c6af 100644 --- a/kompute/op_getrows_q4_1.comp +++ b/kompute/op_getrows_q4_1.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) 
uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits 
- int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_mul.comp b/kompute/op_mul.comp index 4907015d8..348eae7b3 100644 --- a/kompute/op_mul.comp +++ b/kompute/op_mul.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t 
hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_mul_mat_f16.comp b/kompute/op_mul_mat_f16.comp index f1198b593..1390c00cf 100644 --- a/kompute/op_mul_mat_f16.comp +++ b/kompute/op_mul_mat_f16.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: 
require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 
4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 64) in; diff --git a/kompute/op_mul_mat_q4_0.comp b/kompute/op_mul_mat_q4_0.comp index 206aea7d5..9b6dd72dc 100644 --- a/kompute/op_mul_mat_q4_0.comp +++ b/kompute/op_mul_mat_q4_0.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | 
buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // 
quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 8, local_size_y = 8) in; diff --git a/kompute/op_mul_mat_q4_1.comp b/kompute/op_mul_mat_q4_1.comp index 8bdf810a1..fb7b051b8 100644 --- a/kompute/op_mul_mat_q4_1.comp +++ b/kompute/op_mul_mat_q4_1.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized 
scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 8, local_size_y = 8) in; diff --git a/kompute/op_mulrow.comp b/kompute/op_mulrow.comp index 3defd0a5f..498dbdfcd 100644 --- a/kompute/op_mulrow.comp +++ b/kompute/op_mulrow.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require 
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block 
scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_norm.comp b/kompute/op_norm.comp index ec0a8568d..4b2db25e3 100644 --- a/kompute/op_norm.comp +++ b/kompute/op_norm.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf 
u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, 
low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" #define nth 256 diff --git a/kompute/op_relu.comp b/kompute/op_relu.comp index bc2c31f43..41f46be96 100644 --- a/kompute/op_relu.comp +++ b/kompute/op_relu.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // 
super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_rmsnorm.comp b/kompute/op_rmsnorm.comp index 784713c36..dd2c5cdde 100644 --- a/kompute/op_rmsnorm.comp +++ b/kompute/op_rmsnorm.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension 
GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block 
scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" #define nth 256 diff --git a/kompute/op_rope.comp b/kompute/op_rope.comp index ca6bb6831..3fa84f579 100644 --- a/kompute/op_rope.comp +++ b/kompute/op_rope.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define 
u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, 
high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_scale.comp b/kompute/op_scale.comp index f537121a4..8530aaf3e 100644 --- a/kompute/op_scale.comp +++ b/kompute/op_scale.comp @@ -8,122 +8,8 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable +#include "common.comp" -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and 
mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block layout(local_size_x = 1) in; layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; diff --git a/kompute/op_silu.comp b/kompute/op_silu.comp index 90c034ac7..c5acac281 100644 --- a/kompute/op_silu.comp +++ b/kompute/op_silu.comp @@ -8,122 +8,7 @@ #version 450 -#extension 
GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t 
qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp index ce0e71924..e936d8f68 100644 --- a/kompute/op_softmax.comp +++ b/kompute/op_softmax.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 
-#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block 
scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" #define nth 32