From f1c9bc18216606b992a4b13b4154ddf97e443a92 Mon Sep 17 00:00:00 2001
From: Adam Treat
Date: Mon, 2 Oct 2023 09:05:22 -0400
Subject: [PATCH] Add q6_k getrows and mul*vec kernel.

---
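Reviewer note: both new kernels read each q6_k super-block directly out of
the raw byte buffer instead of through aligned struct loads, so the byte
offsets below mirror ggml's block_q6_K layout (QK_K = 256). For reference,
a minimal sketch of that layout as declared in k_quants.h at the time of
this patch:

    // 210 bytes per super-block of 256 weights
    typedef struct {
        uint8_t ql[QK_K/2];      // lower 4 bits of the 6-bit quants (128 bytes)
        uint8_t qh[QK_K/4];      // upper 2 bits of the 6-bit quants (64 bytes)
        int8_t  scales[QK_K/16]; // 8-bit scales, one per 16 weights (16 bytes)
        ggml_fp16_t d;           // fp16 super-block scale (2 bytes)
    } block_q6_K;

The fp16 scale d therefore sits at byte offset QK_K/2 + QK_K/4 + QK_K/16
= 208 within a block, which is exactly the offset both shaders pass to
u8BufToFloat16, and a full block spans 210 bytes (the value that
SIZE_OF_BLOCK / sizeof_block_q6_k is expected to carry).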
 ggml-vulkan.cpp              |  47 ++++++++++++++
 kompute/op_getrows_q6_k.comp |  52 ++++++++++++++++
 kompute/op_mul_mat_q6_k.comp | 117 +++++++++++++++++++++++++++++++++++
 llama.cpp                    |   3 +-
 4 files changed, 218 insertions(+), 1 deletion(-)
 create mode 100644 kompute/op_getrows_q6_k.comp
 create mode 100644 kompute/op_mul_mat_q6_k.comp

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index f770a2d0c..1dd504127 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -25,9 +25,11 @@
 #include "shaderop_mul_mat_f16.h"
 #include "shaderop_mul_mat_q4_0.h"
 #include "shaderop_mul_mat_q4_1.h"
+#include "shaderop_mul_mat_q6_k.h"
 #include "shaderop_getrows_f16.h"
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
+#include "shaderop_getrows_q6_k.h"
 #include "shaderop_rope.h"
 #include "shaderop_cpy_f16_f16.h"
 #include "shaderop_cpy_f16_f32.h"
@@ -52,6 +54,7 @@
 #define QK4_0 32
 #define QR4_0 2
 #define QK4_1 32
+#define QK_NL 16
 
 typedef ggml_fp16_t half;
 struct ggml_kompute_context {
@@ -958,6 +961,38 @@ void ggml_vk_mul_mat_q4_1(Args&&... args) {
     ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
+void ggml_vk_mul_mat_q6_k(kp::Sequence& seq,
+                          const std::shared_ptr<kp::Tensor>& inA,
+                          const std::shared_ptr<kp::Tensor>& inB,
+                          const std::shared_ptr<kp::Tensor>& out,
+                          uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                          int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
+                          int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
+        kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, ne10, ne0, ne1, ne01, gqa;
+    } pushConsts {
+        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne10, ne0, ne1, ne01, ne12/ne02
+    };
+
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+//        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {2,32}, {pushConsts});
+    } else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
 void ggml_vk_get_rows(const std::vector<uint32_t>& spirv,
                       unsigned element_size, unsigned qk,
                       kp::Sequence& seq,
@@ -1016,6 +1051,13 @@ void ggml_vk_get_rows_q4_1(Args&&... args) {
     ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
 }
 
+template <typename... Args>
+void ggml_vk_get_rows_q6_k(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv,
+        kp::shader_data::op_getrows_q6_k_comp_spv_len);
+    ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK_NL, std::forward<Args>(args)...);
+}
+
 void ggml_vk_rope(kp::Sequence& seq,
                   const std::shared_ptr<kp::Tensor>& in,
                   const std::shared_ptr<kp::Tensor>& out,
@@ -1297,6 +1339,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                             case GGML_TYPE_Q4_1:
                                 ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
                                 break;
+                            case GGML_TYPE_Q6_K:
+                                ggml_vk_mul_mat_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                                break;
                             default: {
                                 fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
                                 goto not_implemented;
@@ -1312,6 +1357,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                         ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                     } else if (src0t == GGML_TYPE_Q4_1) {
                         ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                    } else if (src0t == GGML_TYPE_Q6_K) {
+                        ggml_vk_get_rows_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                     } else {
                         fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
                         goto not_implemented;
diff --git a/kompute/op_getrows_q6_k.comp b/kompute/op_getrows_q6_k.comp
new file mode 100644
index 000000000..95817b487
--- /dev/null
+++ b/kompute/op_getrows_q6_k.comp
@@ -0,0 +1,52 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#define NL 16
+#define BYTES_FOR_TYPE 4 /*bytes for float*/
+#define SIZE_OF_BLOCK sizeof_block_q6_k
+
+layout(local_size_x = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { int inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int nb01;
+    int nb1;
+} pcs;
+
+block_q6_k get_unaligned_block_q6_k(uint index) {
+    block_q6_k fres;
+    [[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
+        fres.ql[it] = inA[index + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
+        fres.qh[it] = inA[index + QK_K/2 + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
+        fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
+    }
+    fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
+    return fres;
+}
+
+mat4 dequantize_block(uint index, uint il) {
+    const block_q6_k block = get_unaligned_block_q6_k(index);
+    return dequantize_q6_k(block, il);
+}
+
+#include "op_getrows.comp"
diff --git a/kompute/op_mul_mat_q6_k.comp b/kompute/op_mul_mat_q6_k.comp
new file mode 100644
index 000000000..1e4ea37f8
--- /dev/null
+++ b/kompute/op_mul_mat_q6_k.comp
@@ -0,0 +1,117 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#define SIZE_OF_BLOCK sizeof_block_q6_k
+
+layout(local_size_x_id = 0) in;
+layout(local_size_y_id = 1) in;
+layout(local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { float inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int ne10;
+    int ne0;
+    int ne1;
+    int ne01;
+    int gqa;
+} pcs;
+
+block_q6_k get_unaligned_block_q6_k(uint index) {
+    block_q6_k fres;
+    [[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
+        fres.ql[it] = inA[index + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
+        fres.qh[it] = inA[index + QK_K/2 + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
+        fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
+    }
+    fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
+    return fres;
+}
+
+void main() {
+    const uint8_t kmask1 = uint8_t(0x03);
+    const uint8_t kmask2 = uint8_t(0x0C);
+    const uint8_t kmask3 = uint8_t(0x30);
+    const uint8_t kmask4 = uint8_t(0xC0);
+
+    const int nb = pcs.ne00/QK_K;
+
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+    const uint r2 = gl_WorkGroupID.z;
+
+    const uint row = 2 * r0 + gl_SubgroupID;
+    const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
+    const uint x = row * nb + offset0; // Based from inA without base offset
+    const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
+
+    float sumf = 0;
+
+    const uint tid = gl_SubgroupInvocationID/2;
+    const uint ix  = gl_SubgroupInvocationID%2;
+    const uint ip  = tid/8;   // 0 or 1
+    const uint il  = tid%8;
+    const uint n   = 4;
+    const uint l0  = n*il;
+    const uint is  = 8*ip + l0/16;
+
+    const uint y_offset = 128*ip + l0;
+    const uint q_offset_l = 64*ip + l0;
+    const uint q_offset_h = 32*ip + l0;
+
+    for (uint i = ix; i < nb; i += 2) {
+
+        const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
+//        const uint index = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
+//        const block_q6_k block = get_unaligned_block_q6_k(index);
+
+        const uint qlIndex = q_offset_l;
+        const uint q2Index = qlIndex + 32;
+        const uint qhIndex = q_offset_h;
+        const uint y = yy + i * QK_K + y_offset;
+
+        float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+        for (uint l = 0; l < n; ++l) {
+
+//            const uint8_t currentQ1 = block.ql[qlIndex + l];
+//            const uint8_t currentQ2 = block.ql[q2Index + l];
+//            const uint8_t currentQh = block.qh[qhIndex + l];
+            const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
+            const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
+            const uint8_t currentQh = inA[baseIndex + qhIndex + l];
+
+            sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
+            sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
+            sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32);
+            sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32);
+        }
+
+//        sumf += block.d * (sums[0] * block.scales[0+is] + sums[1] * block.scales[2+is] + sums[2] * block.scales[4+is] + sums[3] * block.scales[6+is]);
+        float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
+        sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
+    }
+
+    const float tot = subgroupAdd(sumf);
+    if (subgroupElect()) {
+        out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
+    }
+}
diff --git a/llama.cpp b/llama.cpp
index 245174898..603f7cc64 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6496,7 +6496,8 @@ struct llama_context * llama_new_context_with_model(
         && (model->ftype == LLAMA_FTYPE_ALL_F32
          || model->ftype == LLAMA_FTYPE_MOSTLY_F16
          || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
-         || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
+         || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1
+         || model->ftype == LLAMA_FTYPE_MOSTLY_Q6_K)) {
         // this allocates all Vulkan resources and memory buffers
         ctx->ctx_kompute = ggml_vk_init();
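
Note on the mul_mat kernel math: each 6-bit weight is rebuilt from a low
nibble in ql plus two high bits from qh, then recentred by subtracting 32
before scaling, matching the q6_K CPU path in ggml. Below is a scalar
sketch of the per-block accumulation that the shader distributes across a
subgroup. Assumptions: the helper name q6_k_block_dot is hypothetical and
not part of this patch; block_q6_K and QK_K come from k_quants.h and
ggml_fp16_to_fp32() from ggml.h:

    // Dot product of one q6_k super-block (256 weights) with 256 floats.
    static float q6_k_block_dot(const block_q6_K * b, const float * y) {
        const float d = ggml_fp16_to_fp32(b->d); // fp16 super-block scale
        const uint8_t * ql = b->ql;
        const uint8_t * qh = b->qh;
        const int8_t  * sc = b->scales;
        float sum = 0.0f;
        for (int n = 0; n < QK_K; n += 128) { // two 128-weight halves (ip = 0, 1)
            for (int l = 0; l < 32; ++l) {
                const int is = l/16; // scale index within this half
                // 6-bit value = low 4 bits from ql | two high bits from qh, minus 32
                const int q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                const int q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                const int q3 = (int8_t)((ql[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                const int q4 = (int8_t)((ql[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
                sum += d * (sc[is + 0]*q1*y[l +  0] + sc[is + 2]*q2*y[l + 32]
                          + sc[is + 4]*q3*y[l + 64] + sc[is + 6]*q4*y[l + 96]);
            }
            y += 128; ql += 64; qh += 32; sc += 8;
        }
        return sum;
    }

The shader's kmask constants with their <<4, <<2, <<0 and >>2 shifts
implement the same two-bit extraction as the (qh[l] >> k) & 3 terms above,
just with the shift folded into the mask.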