From f1c9bc18216606b992a4b13b4154ddf97e443a92 Mon Sep 17 00:00:00 2001
From: Adam Treat
Date: Mon, 2 Oct 2023 09:05:22 -0400
Subject: [PATCH] Add q6_k getrows and mul*vec kernel.

---
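Reviewer note: both new kernels read each q6_k super-block directly out of
the raw byte buffer instead of through aligned struct loads, so the byte
offsets below mirror ggml's block_q6_K layout (QK_K = 256). For reference,
a minimal sketch of that layout as declared in k_quants.h at the time of
this patch:

    // 210 bytes per super-block of 256 weights
    typedef struct {
        uint8_t ql[QK_K/2];      // lower 4 bits of the 6-bit quants (128 bytes)
        uint8_t qh[QK_K/4];      // upper 2 bits of the 6-bit quants (64 bytes)
        int8_t  scales[QK_K/16]; // 8-bit scales, one per 16 weights (16 bytes)
        ggml_fp16_t d;           // fp16 super-block scale (2 bytes)
    } block_q6_K;

The fp16 scale d therefore sits at byte offset QK_K/2 + QK_K/4 + QK_K/16
= 208 within a block, which is exactly the offset both shaders pass to
u8BufToFloat16, and a full block spans 210 bytes (the value that
SIZE_OF_BLOCK / sizeof_block_q6_k is expected to carry).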
 ggml-vulkan.cpp              |  47 ++++++++++++++
 kompute/op_getrows_q6_k.comp |  52 ++++++++++++++++
 kompute/op_mul_mat_q6_k.comp | 117 +++++++++++++++++++++++++++++++++++
 llama.cpp                    |   3 +-
 4 files changed, 218 insertions(+), 1 deletion(-)
 create mode 100644 kompute/op_getrows_q6_k.comp
 create mode 100644 kompute/op_mul_mat_q6_k.comp

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index f770a2d0c..1dd504127 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -25,9 +25,11 @@
 #include "shaderop_mul_mat_f16.h"
 #include "shaderop_mul_mat_q4_0.h"
 #include "shaderop_mul_mat_q4_1.h"
+#include "shaderop_mul_mat_q6_k.h"
 #include "shaderop_getrows_f16.h"
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
+#include "shaderop_getrows_q6_k.h"
 #include "shaderop_rope.h"
 #include "shaderop_cpy_f16_f16.h"
 #include "shaderop_cpy_f16_f32.h"
@@ -52,6 +54,7 @@
 #define QK4_0 32
 #define QR4_0 2
 #define QK4_1 32
+#define QK_NL 16
 
 typedef ggml_fp16_t half;
 struct ggml_kompute_context {
@@ -958,6 +961,38 @@ void ggml_vk_mul_mat_q4_1(Args&&... args) {
     ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
+void ggml_vk_mul_mat_q6_k(kp::Sequence& seq,
+                          const std::shared_ptr<kp::Tensor>& inA,
+                          const std::shared_ptr<kp::Tensor>& inB,
+                          const std::shared_ptr<kp::Tensor>& out,
+                          uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                          int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
+                          int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
+        kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, ne10, ne0, ne1, ne01, gqa;
+    } pushConsts {
+        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne10, ne0, ne1, ne01, ne12/ne02
+    };
+
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+//        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {2,32}, {pushConsts});
+    } else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
 void ggml_vk_get_rows(const std::vector<uint32_t>& spirv,
                       unsigned element_size, unsigned qk,
                       kp::Sequence& seq,
@@ -1016,6 +1051,13 @@ void ggml_vk_get_rows_q4_1(Args&&... args) {
     ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
 }
 
+template <typename... Args>
+void ggml_vk_get_rows_q6_k(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv,
+        kp::shader_data::op_getrows_q6_k_comp_spv_len);
+    ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK_NL, std::forward<Args>(args)...);
+}
+
 void ggml_vk_rope(kp::Sequence& seq,
                   const std::shared_ptr<kp::Tensor>& in,
                   const std::shared_ptr<kp::Tensor>& out,
@@ -1297,6 +1339,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                             case GGML_TYPE_Q4_1:
                                 ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
                                 break;
+                            case GGML_TYPE_Q6_K:
+                                ggml_vk_mul_mat_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                                break;
                             default: {
                                 fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
                                 goto not_implemented;
@@ -1312,6 +1357,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                         ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                     } else if (src0t == GGML_TYPE_Q4_1) {
                         ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                    } else if (src0t == GGML_TYPE_Q6_K) {
+                        ggml_vk_get_rows_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                     } else {
                         fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
                         goto not_implemented;
diff --git a/kompute/op_getrows_q6_k.comp b/kompute/op_getrows_q6_k.comp
new file mode 100644
index 000000000..95817b487
--- /dev/null
+++ b/kompute/op_getrows_q6_k.comp
@@ -0,0 +1,52 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#define NL 16
+#define BYTES_FOR_TYPE 4 /*bytes for float*/
+#define SIZE_OF_BLOCK sizeof_block_q6_k
+
+layout(local_size_x = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { int inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int nb01;
+    int nb1;
+} pcs;
+
+block_q6_k get_unaligned_block_q6_k(uint index) {
+    block_q6_k fres;
+    [[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
+        fres.ql[it] = inA[index + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
+        fres.qh[it] = inA[index + QK_K/2 + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
+        fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
+    }
+    fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
+    return fres;
+}
+
+mat4 dequantize_block(uint index, uint il) {
+    const block_q6_k block = get_unaligned_block_q6_k(index);
+    return dequantize_q6_k(block, il);
+}
+
+#include "op_getrows.comp"
diff --git a/kompute/op_mul_mat_q6_k.comp b/kompute/op_mul_mat_q6_k.comp
new file mode 100644
index 000000000..1e4ea37f8
--- /dev/null
+++ b/kompute/op_mul_mat_q6_k.comp
@@ -0,0 +1,117 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#define SIZE_OF_BLOCK sizeof_block_q6_k
+
+layout(local_size_x_id = 0) in;
+layout(local_size_y_id = 1) in;
+layout(local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { float inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int ne10;
+    int ne0;
+    int ne1;
+    int ne01;
+    int gqa;
+} pcs;
+
+block_q6_k get_unaligned_block_q6_k(uint index) {
+    block_q6_k fres;
+    [[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
+        fres.ql[it] = inA[index + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
+        fres.qh[it] = inA[index + QK_K/2 + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
+        fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
+    }
+    fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
+    return fres;
+}
+
+void main() {
+    const uint8_t kmask1 = uint8_t(0x03);
+    const uint8_t kmask2 = uint8_t(0x0C);
+    const uint8_t kmask3 = uint8_t(0x30);
+    const uint8_t kmask4 = uint8_t(0xC0);
+
+    const int nb = pcs.ne00/QK_K;
+
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+    const uint r2 = gl_WorkGroupID.z;
+
+    const uint row = 2 * r0 + gl_SubgroupID;
+    const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
+    const uint x = row * nb + offset0; // Based from inA without base offset
+    const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
+
+    float sumf = 0;
+
+    const uint tid = gl_SubgroupInvocationID/2;
+    const uint ix  = gl_SubgroupInvocationID%2;
+    const uint ip  = tid/8;   // 0 or 1
+    const uint il  = tid%8;
+    const uint n   = 4;
+    const uint l0  = n*il;
+    const uint is  = 8*ip + l0/16;
+
+    const uint y_offset = 128*ip + l0;
+    const uint q_offset_l = 64*ip + l0;
+    const uint q_offset_h = 32*ip + l0;
+
+    for (uint i = ix; i < nb; i += 2) {
+
+        const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
+//        const uint index = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
+//        const block_q6_k block = get_unaligned_block_q6_k(index);
+
+        const uint qlIndex = q_offset_l;
+        const uint q2Index = qlIndex + 32;
+        const uint qhIndex = q_offset_h;
+        const uint y = yy + i * QK_K + y_offset;
+
+        float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+        for (uint l = 0; l < n; ++l) {
+
+//            const uint8_t currentQ1 = block.ql[qlIndex + l];
+//            const uint8_t currentQ2 = block.ql[q2Index + l];
+//            const uint8_t currentQh = block.qh[qhIndex + l];
+            const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
+            const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
+            const uint8_t currentQh = inA[baseIndex + qhIndex + l];
+
+            sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
+            sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
+            sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32);
+            sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32);
+        }
+
+//        sumf += block.d * (sums[0] * block.scales[0+is] + sums[1] * block.scales[2+is] + sums[2] * block.scales[4+is] + sums[3] * block.scales[6+is]);
+        float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
+        sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
+    }
+
+    const float tot = subgroupAdd(sumf);
+    if (subgroupElect()) {
+        out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
+    }
+}
diff --git a/llama.cpp b/llama.cpp
index 245174898..603f7cc64 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6496,7 +6496,8 @@ struct llama_context * llama_new_context_with_model(
         && (model->ftype == LLAMA_FTYPE_ALL_F32
          || model->ftype == LLAMA_FTYPE_MOSTLY_F16
          || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
-         || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
+         || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1
+         || model->ftype == LLAMA_FTYPE_MOSTLY_Q6_K)) {
         // this allocates all Vulkan resources and memory buffers
         ctx->ctx_kompute = ggml_vk_init();
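
Note on the mul_mat kernel math: each 6-bit weight is rebuilt from a low
nibble in ql plus two high bits from qh, then recentred by subtracting 32
before scaling, matching the q6_K CPU path in ggml. Below is a scalar
sketch of the per-block accumulation that the shader distributes across a
subgroup. Assumptions: the helper name q6_k_block_dot is hypothetical and
not part of this patch; block_q6_K and QK_K come from k_quants.h and
ggml_fp16_to_fp32() from ggml.h:

    // Dot product of one q6_k super-block (256 weights) with 256 floats.
    static float q6_k_block_dot(const block_q6_K * b, const float * y) {
        const float d = ggml_fp16_to_fp32(b->d); // fp16 super-block scale
        const uint8_t * ql = b->ql;
        const uint8_t * qh = b->qh;
        const int8_t  * sc = b->scales;
        float sum = 0.0f;
        for (int n = 0; n < QK_K; n += 128) { // two 128-weight halves (ip = 0, 1)
            for (int l = 0; l < 32; ++l) {
                const int is = l/16; // scale index within this half
                // 6-bit value = low 4 bits from ql | two high bits from qh, minus 32
                const int q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                const int q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                const int q3 = (int8_t)((ql[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                const int q4 = (int8_t)((ql[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
                sum += d * (sc[is + 0]*q1*y[l +  0] + sc[is + 2]*q2*y[l + 32]
                          + sc[is + 4]*q3*y[l + 64] + sc[is + 6]*q4*y[l + 96]);
            }
            y += 128; ql += 64; qh += 32; sc += 8;
        }
        return sum;
    }

The shader's kmask constants with their <<4, <<2, <<0 and >>2 shifts
implement the same two-bit extraction as the (qh[l] >> k) & 3 terms above,
just with the shift folded into the mask.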