Add q6_k getrows and mul*vec kernel.

parent 4b223ec432
commit f1c9bc1821
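This commit adds Q6_K support to the Kompute (Vulkan) backend: two new compute shaders (a getrows kernel and a mat*vec multiply), host-side wrappers to dispatch them, and a widened ftype gate in llama.cpp. For orientation, here is a C sketch of the Q6_K super-block the shaders unpack, mirroring ggml's block_q6_K (QK_K = 256); the field order fixes the byte offsets the shaders read at:

    #define QK_K 256

    // 256 weights per super-block, 6 bits each, split across two arrays.
    typedef struct {
        uint8_t     ql[QK_K/2];      // lower 4 bits of each quant (128 bytes)
        uint8_t     qh[QK_K/4];      // upper 2 bits, four quants per byte (64 bytes)
        int8_t      scales[QK_K/16]; // one 8-bit scale per 16-weight sub-block
        ggml_fp16_t d;               // fp16 super-block scale
    } block_q6_k;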
@@ -25,9 +25,11 @@
 #include "shaderop_mul_mat_f16.h"
 #include "shaderop_mul_mat_q4_0.h"
 #include "shaderop_mul_mat_q4_1.h"
+#include "shaderop_mul_mat_q6_k.h"
 #include "shaderop_getrows_f16.h"
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
+#include "shaderop_getrows_q6_k.h"
 #include "shaderop_rope.h"
 #include "shaderop_cpy_f16_f16.h"
 #include "shaderop_cpy_f16_f32.h"
@@ -52,6 +54,7 @@
 #define QK4_0 32
 #define QR4_0 2
 #define QK4_1 32
+#define QK_NL 16
 
 typedef ggml_fp16_t half;
 struct ggml_kompute_context {
@@ -958,6 +961,38 @@ void ggml_vk_mul_mat_q4_1(Args&&... args) {
     ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
+void ggml_vk_mul_mat_q6_k(kp::Sequence& seq,
+                          const std::shared_ptr<kp::Tensor>& inA,
+                          const std::shared_ptr<kp::Tensor>& inB,
+                          const std::shared_ptr<kp::Tensor>& out,
+                          uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                          int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
+                          int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
+                                             kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, ne10, ne0, ne1, ne01, gqa;
+    } pushConsts {
+        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne10, ne0, ne1, ne01, ne12/ne02
+    };
+
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        // const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {2,32}, {pushConsts});
+    } else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
 void ggml_vk_get_rows(const std::vector<uint32_t>& spirv,
                       unsigned element_size, unsigned qk,
                       kp::Sequence& seq,
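Two details of this wrapper are easy to miss. The offsets are scaled asymmetrically: inAOff stays a byte offset because the shader binds inA as a uint8_t array (a 210-byte block_q6_k generally sits unaligned), while inBOff and outOff become float indices via safe_divide(..., 4) since those bindings are float-typed. And the {2,32} specialization constants set a 2x32 local size that pairs with the {(ne01 + 1)/2, ne11, ne12} grid: assuming a 32-lane subgroup, each workgroup holds two subgroups, one output row each. A sketch of that arithmetic (names here are illustrative, not from the diff):

    // Assumes subgroupSize == 32, matching the {2,32} specialization constants.
    uint32_t groups_x = (ne01 + 1) / 2;       // ceil(rows/2): two rows per workgroup
    uint32_t groups_y = ne11;                 // one src1 column per workgroup
    uint32_t groups_z = ne12;                 // batch; gqa = ne12/ne02 folds grouped
                                              //   KV heads onto query heads
    uint32_t a_off = inAOff;                  // bytes: blocks are read unaligned
    uint32_t b_off = safe_divide(inBOff, 4);  // float elements
    uint32_t o_off = safe_divide(outOff, 4);  // float elements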
@@ -1016,6 +1051,13 @@ void ggml_vk_get_rows_q4_1(Args&&... args) {
     ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
 }
 
+template <typename... Args>
+void ggml_vk_get_rows_q6_k(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv,
+                                             kp::shader_data::op_getrows_q6_k_comp_spv_len);
+    ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK_NL, std::forward<Args>(args)...);
+}
+
 void ggml_vk_rope(kp::Sequence& seq,
                   const std::shared_ptr<kp::Tensor>& in,
                   const std::shared_ptr<kp::Tensor>& out,
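Note the qk argument is QK_NL (16), not QK_K (256): it matches the NL defined in the new getrows shader and the 16 floats of the mat4 that dequantize_block returns, so the shared getrows template walks a row in 16-element tiles rather than whole super-blocks. The arithmetic, as a sketch:

    constexpr int QK_K  = 256;  // weights per Q6_K super-block
    constexpr int QK_NL = 16;   // floats per dequantize_block call (one mat4)
    constexpr int tiles_per_block = QK_K / QK_NL;  // 16 tiles, selected via `il`
    static_assert(tiles_per_block == 16, "one mat4 tile per il value");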
@@ -1297,6 +1339,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
             case GGML_TYPE_Q4_1:
                 ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
                 break;
+            case GGML_TYPE_Q6_K:
+                ggml_vk_mul_mat_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                break;
             default: {
                 fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
                 goto not_implemented;
@@ -1312,6 +1357,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                 ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
             } else if (src0t == GGML_TYPE_Q4_1) {
                 ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+            } else if (src0t == GGML_TYPE_Q6_K) {
+                ggml_vk_get_rows_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
             } else {
                 fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
                 goto not_implemented;
kompute/op_getrows_q6_k.comp (new file, 52 lines)
@@ -0,0 +1,52 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#define NL 16
+#define BYTES_FOR_TYPE 4 /*bytes for float*/
+#define SIZE_OF_BLOCK sizeof_block_q6_k
+
+layout(local_size_x = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { int inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int nb01;
+    int nb1;
+} pcs;
+
+block_q6_k get_unaligned_block_q6_k(uint index) {
+    block_q6_k fres;
+    [[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
+        fres.ql[it] = inA[index + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
+        fres.qh[it] = inA[index + QK_K/2 + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
+        fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
+    }
+    fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
+    return fres;
+}
+
+mat4 dequantize_block(uint index, uint il) {
+    const block_q6_k block = get_unaligned_block_q6_k(index);
+    return dequantize_q6_k(block, il);
+}
+
+#include "op_getrows.comp"
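The byte-by-byte assembly in get_unaligned_block_q6_k is deliberate: a Q6_K block occupies 128 + 64 + 16 + 2 = 210 bytes, which is not a multiple of 4, so blocks after the first generally sit at addresses a structured SSBO view could not read; binding inA as uint8_t sidesteps that. A sketch of the offsets the three loops and the final fp16 read use (QK_K and sizeof_block_q6_k are assumed to come from common.comp):

    constexpr uint32_t QK_K       = 256;
    constexpr uint32_t OFF_QL     = 0;                     // ql: QK_K/2 = 128 bytes
    constexpr uint32_t OFF_QH     = QK_K/2;                // qh: QK_K/4 = 64 bytes
    constexpr uint32_t OFF_SCALES = QK_K/2 + QK_K/4;       // scales: QK_K/16 = 16 bytes
    constexpr uint32_t OFF_D      = OFF_SCALES + QK_K/16;  // d: 2 bytes of fp16
    static_assert(OFF_D + 2 == 210, "matches sizeof_block_q6_k");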
kompute/op_mul_mat_q6_k.comp (new file, 117 lines)
@@ -0,0 +1,117 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#define SIZE_OF_BLOCK sizeof_block_q6_k
+
+layout(local_size_x_id = 0) in;
+layout(local_size_y_id = 1) in;
+layout(local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { float inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int ne10;
+    int ne0;
+    int ne1;
+    int ne01;
+    int gqa;
+} pcs;
+
+block_q6_k get_unaligned_block_q6_k(uint index) {
+    block_q6_k fres;
+    [[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
+        fres.ql[it] = inA[index + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
+        fres.qh[it] = inA[index + QK_K/2 + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
+        fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
+    }
+    fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
+    return fres;
+}
+
+void main() {
+    const uint8_t kmask1 = uint8_t(0x03);
+    const uint8_t kmask2 = uint8_t(0x0C);
+    const uint8_t kmask3 = uint8_t(0x30);
+    const uint8_t kmask4 = uint8_t(0xC0);
+
+    const int nb = pcs.ne00/QK_K;
+
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+    const uint r2 = gl_WorkGroupID.z;
+
+    const uint row = 2 * r0 + gl_SubgroupID;
+    const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
+    const uint x = row * nb + offset0; // Based from inA without base offset
+    const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
+
+    float sumf = 0;
+
+    const uint tid = gl_SubgroupInvocationID/2;
+    const uint ix = gl_SubgroupInvocationID%2;
+    const uint ip = tid/8; // 0 or 1
+    const uint il = tid%8;
+    const uint n = 4;
+    const uint l0 = n*il;
+    const uint is = 8*ip + l0/16;
+
+    const uint y_offset = 128*ip + l0;
+    const uint q_offset_l = 64*ip + l0;
+    const uint q_offset_h = 32*ip + l0;
+
+    for (uint i = ix; i < nb; i += 2) {
+
+        const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
+        // const uint index = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
+        // const block_q6_k block = get_unaligned_block_q6_k(index);
+
+        const uint qlIndex = q_offset_l;
+        const uint q2Index = qlIndex + 32;
+        const uint qhIndex = q_offset_h;
+        const uint y = yy + i * QK_K + y_offset;
+
+        float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+        for (uint l = 0; l < n; ++l) {
+
+            // const uint8_t currentQ1 = block.ql[qlIndex + l];
+            // const uint8_t currentQ2 = block.ql[q2Index + l];
+            // const uint8_t currentQh = block.qh[qhIndex + l];
+            const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
+            const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
+            const uint8_t currentQh = inA[baseIndex + qhIndex + l];
+
+            sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
+            sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
+            sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32);
+            sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32);
+        }
+
+        // sumf += block.d * (sums[0] * block.scales[0+is] + sums[1] * block.scales[2+is] + sums[2] * block.scales[4+is] + sums[3] * block.scales[6+is]);
+        float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
+        sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
+    }
+
+    const float tot = subgroupAdd(sumf);
+    if (subgroupElect()) {
+        out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
+    }
+}
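The four sums[] updates are the core of the kernel: each weight's low 4 bits come from ql and its high 2 bits from qh, one qh byte feeding four different weights through the kmask constants. A scalar reference of that reconstruction (a sketch of what one lane computes, not part of the shader):

    #include <cstdint>

    // q1 = ql[l], q2 = ql[l + 32], qh = qh[l]; each result lands in [-32, 31].
    void decode_q6_quad(uint8_t q1, uint8_t q2, uint8_t qh, int w[4]) {
        w[0] = int8_t((q1 & 0xF) | ((qh & 0x03) << 4)) - 32;  // low nibble  + qh bits 0-1
        w[1] = int8_t((q2 & 0xF) | ((qh & 0x0C) << 2)) - 32;  // low nibble  + qh bits 2-3
        w[2] = int8_t((q1 >> 4)  |  (qh & 0x30))       - 32;  // high nibble + qh bits 4-5
        w[3] = int8_t((q2 >> 4)  | ((qh & 0xC0) >> 2)) - 32;  // high nibble + qh bits 6-7
    }

Each partial sum j is then weighted by the int8 sub-block scale at scales[2*j + is] and by the fp16 super-block scale d, and subgroupAdd folds the per-lane totals across the subgroup into the single output element written under subgroupElect().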
@@ -6496,7 +6496,8 @@ struct llama_context * llama_new_context_with_model(
         && (model->ftype == LLAMA_FTYPE_ALL_F32
             || model->ftype == LLAMA_FTYPE_MOSTLY_F16
             || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
-            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
+            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1
+            || model->ftype == LLAMA_FTYPE_MOSTLY_Q6_K)) {
         // this allocates all Vulkan resources and memory buffers
         ctx->ctx_kompute = ggml_vk_init();
 