Add q6_k getrows and mul*vec kernel.

This commit is contained in:
Adam Treat 2023-10-02 09:05:22 -04:00 committed by cebtenzzre
parent 4b223ec432
commit f1c9bc1821
4 changed files with 218 additions and 1 deletions

View File

@ -25,9 +25,11 @@
#include "shaderop_mul_mat_f16.h"
#include "shaderop_mul_mat_q4_0.h"
#include "shaderop_mul_mat_q4_1.h"
#include "shaderop_mul_mat_q6_k.h"
#include "shaderop_getrows_f16.h"
#include "shaderop_getrows_q4_0.h"
#include "shaderop_getrows_q4_1.h"
#include "shaderop_getrows_q6_k.h"
#include "shaderop_rope.h"
#include "shaderop_cpy_f16_f16.h"
#include "shaderop_cpy_f16_f32.h"
@ -52,6 +54,7 @@
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define QK_NL 16
typedef ggml_fp16_t half;
struct ggml_kompute_context {
@ -958,6 +961,38 @@ void ggml_vk_mul_mat_q4_1(Args&&... args) {
ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
}
/**
 * Records a dispatch of the Q6_K mul-mat compute shader into `seq`.
 *
 * inA, inB and out are bound to the shader in that order. inAOff is forwarded
 * unchanged, while inBOff and outOff go through safe_divide(..., 4) before
 * being placed in the push constants. The shader's `gqa` push constant is
 * ne12/ne02.
 *
 * The dispatch grid is ((ne01 + 1)/2, ne11, ne12) workgroups and the
 * specialization constants passed at creation are {2, 32}.
 *
 * The kp::Algorithm is cached in the kompute manager under this function's
 * name: it is created on the first call and afterwards only re-pointed at the
 * new tensors, workgroup and push constants.
 */
void ggml_vk_mul_mat_q6_k(kp::Sequence& seq,
                          const std::shared_ptr<kp::Tensor>& inA,
                          const std::shared_ptr<kp::Tensor>& inB,
                          const std::shared_ptr<kp::Tensor>& out,
                          uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
                          int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
                          int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) {
    static const auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
                                             kp::shader_data::op_mul_mat_q6_k_comp_spv_len);

    // Field order must match the `parameter` push-constant block of the shader.
    struct PushConstants {
        uint32_t inAOff, inBOff, outOff;
        int32_t ne00, ne10, ne0, ne1, ne01, gqa;
    };
    const PushConstants pushConsts {
        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
        ne00, ne10, ne0, ne1, ne01, ne12/ne02
    };

    // Each workgroup covers two rows of the first operand, hence the rounded-up halving.
    const unsigned groupsX = unsigned((ne01 + 1)/2);
    const unsigned groupsY = unsigned(ne11);
    const unsigned groupsZ = unsigned(ne12);

    std::shared_ptr<kp::Algorithm> algo;
    if (komputeManager()->hasAlgorithm(__func__)) {
        // Reuse the cached pipeline; only the bindings and parameters change.
        algo = komputeManager()->getAlgorithm(__func__);
        algo->setTensors({inA, inB, out});
        algo->setWorkgroup({groupsX, groupsY, groupsZ});
        algo->setPushConstants<PushConstants>({pushConsts});
        algo->updateDescriptors(s_kompute_context->pool.get());
    } else {
        // First use: build the pipeline with specialization constants {2, 32}.
        algo = komputeManager()->algorithm<uint32_t, PushConstants>(
            __func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv,
            {groupsX, groupsY, groupsZ}, {2,32}, {pushConsts});
    }

    seq.record<kp::OpAlgoDispatch>(algo);
}
void ggml_vk_get_rows(const std::vector<uint32_t>& spirv,
unsigned element_size, unsigned qk,
kp::Sequence& seq,
@ -1016,6 +1051,13 @@ void ggml_vk_get_rows_q4_1(Args&&... args) {
ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
}
/**
 * Get-rows entry point for Q6_K sources.
 *
 * Loads the Q6_K get-rows SPIR-V once (function-local static) and forwards it,
 * together with an element size of 1 and QK_NL as the `qk` argument, to the
 * generic ggml_vk_get_rows; all remaining arguments are perfectly forwarded.
 */
template <typename... Args>
void ggml_vk_get_rows_q6_k(Args&&... args) {
    static const auto shader = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv,
                                              kp::shader_data::op_getrows_q6_k_comp_spv_len);
    ggml_vk_get_rows(shader,
                     /*element_size=*/1 /*We access blocks unaligned*/,
                     /*qk=*/QK_NL,
                     std::forward<Args>(args)...);
}
void ggml_vk_rope(kp::Sequence& seq,
const std::shared_ptr<kp::Tensor>& in,
const std::shared_ptr<kp::Tensor>& out,
@ -1297,6 +1339,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
case GGML_TYPE_Q4_1:
ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
break;
case GGML_TYPE_Q6_K:
ggml_vk_mul_mat_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
break;
default: {
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
goto not_implemented;
@ -1312,6 +1357,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
} else if (src0t == GGML_TYPE_Q4_1) {
ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
} else if (src0t == GGML_TYPE_Q6_K) {
ggml_vk_get_rows_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
} else {
fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
goto not_implemented;

View File

@ -0,0 +1,52 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#include "common.comp"
// NOTE(review): NL, BYTES_FOR_TYPE and SIZE_OF_BLOCK are not referenced by the code
// written in this file; presumably they configure the op_getrows.comp body that is
// included at the end of the shader - confirm against that file.
#define NL 16
#define BYTES_FOR_TYPE 4 /*bytes for float*/
// sizeof_block_q6_k is not defined in this file (presumably provided by common.comp).
#define SIZE_OF_BLOCK sizeof_block_q6_k
// Local size of 1 in x.
layout(local_size_x = 1) in;
// Binding 0: source tensor, viewed as raw bytes so blocks can be read at any byte index.
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
// Binding 1: second input, viewed as ints.
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
// Binding 2: output, viewed as floats.
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
// Push constants supplied by the host for each dispatch.
// NOTE(review): field meanings follow ggml naming (offsets, ne00 = row length,
// nb01 / nb1 = strides); the units are not visible here - confirm against the
// host-side ggml_vk_get_rows and op_getrows.comp.
layout (push_constant) uniform parameter {
uint inAOff;
uint inBOff;
uint outOff;
int ne00;
int nb01;
int nb1;
} pcs;
// Builds a block_q6_k value from the bytes of inA starting at byte `index`.
// The fields are read back to back: QK_K/2 bytes of ql, QK_K/4 bytes of qh,
// QK_K/16 scale bytes (reinterpreted as int8_t) and finally d, decoded with
// u8BufToFloat16. Reading byte by byte means `index` needs no alignment.
block_q6_k get_unaligned_block_q6_k(uint index) {
    // Start of each field, measured in bytes from the beginning of inA.
    const uint qlBase     = index;
    const uint qhBase     = index + QK_K/2;
    const uint scalesBase = index + QK_K/2 + QK_K/4;
    const uint dBase      = index + QK_K/2 + QK_K/4 + QK_K/16;

    block_q6_k blk;
    [[unroll]] for (uint i = 0; i < QK_K / 2; ++i) {
        blk.ql[i] = inA[qlBase + i];
    }
    [[unroll]] for (uint i = 0; i < QK_K / 4; ++i) {
        blk.qh[i] = inA[qhBase + i];
    }
    [[unroll]] for (uint i = 0; i < QK_K / 16; ++i) {
        blk.scales[i] = int8_t(inA[scalesBase + i]);
    }
    blk.d = u8BufToFloat16(inA, dBase);
    return blk;
}
// Dequantization hook: loads the Q6_K block that starts at byte `index` of inA
// and returns the mat4 produced by dequantize_q6_k for sub-position `il`.
// NOTE(review): presumably called by the op_getrows.comp body included below - confirm.
mat4 dequantize_block(uint index, uint il) {
    return dequantize_q6_k(get_unaligned_block_q6_k(index), il);
}
#include "op_getrows.comp"

View File

@ -0,0 +1,117 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#include "common.comp"
// Byte size of one Q6_K block; sizeof_block_q6_k is not defined in this file
// (presumably provided by common.comp).
#define SIZE_OF_BLOCK sizeof_block_q6_k
// Local size x / y come from specialization constants 0 and 1; the host-side
// ggml_vk_mul_mat_q6_k passes {2, 32} for them.
layout(local_size_x_id = 0) in;
layout(local_size_y_id = 1) in;
layout(local_size_z = 1) in;
// Binding 0: quantized matrix, viewed as raw bytes (indexed by byte in main()).
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
// Binding 1: float input.
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
// Binding 2: float output.
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
// Push constants; the field order mirrors the PushConstants struct filled by the
// host-side ggml_vk_mul_mat_q6_k. There inAOff is passed through unchanged (it is
// added to a byte index in main()), inBOff and outOff are passed through
// safe_divide(..., 4), and gqa is ne12/ne02.
layout (push_constant) uniform parameter {
uint inAOff;
uint inBOff;
uint outOff;
int ne00;
int ne10;
int ne0;
int ne1;
int ne01;
int gqa;
} pcs;
// Copies one Q6_K block out of inA, starting at byte `index`, into a block_q6_k.
// Field order in the byte stream: ql (QK_K/2 bytes), qh (QK_K/4 bytes),
// scales (QK_K/16 bytes, reinterpreted as int8_t), then d (decoded by u8BufToFloat16).
// NOTE(review): main() in this shader reads inA directly and does not call this helper.
block_q6_k get_unaligned_block_q6_k(uint index) {
    const uint qhStart     = index + QK_K/2;
    const uint scalesStart = index + QK_K/2 + QK_K/4;
    const uint dStart      = index + QK_K/2 + QK_K/4 + QK_K/16;

    block_q6_k result;
    [[unroll]] for (uint k = 0; k < QK_K / 2; ++k) {
        result.ql[k] = inA[index + k];
    }
    [[unroll]] for (uint k = 0; k < QK_K / 4; ++k) {
        result.qh[k] = inA[qhStart + k];
    }
    [[unroll]] for (uint k = 0; k < QK_K / 16; ++k) {
        result.scales[k] = int8_t(inA[scalesStart + k]);
    }
    result.d = u8BufToFloat16(inA, dStart);
    return result;
}
// Q6_K matrix * f32 vector product: each subgroup produces one element of out_.
//
// Workgroup ids: x selects a pair of matrix rows, y (r1) the inB row, z (r2) the batch.
// A subgroup handles row 2*r0 + gl_SubgroupID. Its invocations accumulate partial dot
// products over the row's blocks, subgroupAdd combines them, and one elected invocation
// stores the result.
//
// Block bytes are read straight from inA using the same layout that
// get_unaligned_block_q6_k uses: ql at +0, qh at +QK_K/2, scales at +QK_K/2+QK_K/4,
// d at +QK_K/2+QK_K/4+QK_K/16.
//
// NOTE(review): the lane split below (tid/ix/ip/il) covers a whole block only when a
// subgroup has 32 invocations, and the row formula assumes two subgroups per
// workgroup - confirm for the target devices.
void main() {
    // Masks isolating each 2-bit field of a qh byte.
    const uint8_t kmask1 = uint8_t(0x03);
    const uint8_t kmask2 = uint8_t(0x0C);
    const uint8_t kmask3 = uint8_t(0x30);
    const uint8_t kmask4 = uint8_t(0xC0);

    const int nb = pcs.ne00/QK_K; // blocks per matrix row

    const uint r0 = gl_WorkGroupID.x;
    const uint r1 = gl_WorkGroupID.y;
    const uint r2 = gl_WorkGroupID.z;

    const uint row = 2 * r0 + gl_SubgroupID;

    // The host dispatches (ne01 + 1)/2 workgroups in x, so for an odd ne01 the last
    // workgroup's second row does not exist. Skip it instead of reading past the matrix
    // and writing past the output row. `row` is uniform across the subgroup, so the whole
    // subgroup leaves together and the subgroup operations below stay well-defined.
    if (row >= uint(pcs.ne01)) {
        return;
    }

    const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
    const uint x = row * nb + offset0; // Based from inA without base offset
    const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB

    float sumf = 0;

    // Pairs of adjacent invocations share a position (tid) and split the blocks
    // between them by parity (ix).
    const uint tid = gl_SubgroupInvocationID/2;
    const uint ix  = gl_SubgroupInvocationID%2;
    const uint ip  = tid/8; // 0 or 1
    const uint il  = tid%8;
    const uint n   = 4;
    const uint l0  = n*il;
    const uint is  = 8*ip + l0/16;

    const uint y_offset   = 128*ip + l0;
    const uint q_offset_l = 64*ip + l0;
    const uint q_offset_h = 32*ip + l0;

    for (uint i = ix; i < nb; i += 2) {
        // Byte index of block (x + i) inside inA.
        const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;

        const uint qlIndex = baseIndex + q_offset_l;
        const uint q2Index = qlIndex + 32;
        // qh starts QK_K/2 bytes into the block, after the ql bytes. Without this base
        // the high bits would be taken from ql data.
        const uint qhIndex = baseIndex + QK_K/2 + q_offset_h;
        const uint scalesIndex = baseIndex + QK_K/2 + QK_K/4 + is;

        const uint y = yy + i * QK_K + y_offset;

        float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
        for (uint l = 0; l < n; ++l) {
            const uint8_t currentQ1 = inA[qlIndex + l];
            const uint8_t currentQ2 = inA[q2Index + l];
            const uint8_t currentQh = inA[qhIndex + l];

            // Each quant is 4 low bits from ql plus 2 high bits from qh, recentred by -32.
            sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
            sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
            sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32);
            sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32);
        }

        // Per-block scale d times the four partial sums, each weighted by its int8 scale.
        float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
        sumf += d * (sums[0] * int8_t(inA[scalesIndex]) + sums[1] * int8_t(inA[scalesIndex + 2]) + sums[2] * int8_t(inA[scalesIndex + 4]) + sums[3] * int8_t(inA[scalesIndex + 6]));
    }

    const float tot = subgroupAdd(sumf);
    if (subgroupElect()) {
        out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
    }
}

View File

@ -6496,7 +6496,8 @@ struct llama_context * llama_new_context_with_model(
&& (model->ftype == LLAMA_FTYPE_ALL_F32
|| model->ftype == LLAMA_FTYPE_MOSTLY_F16
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q6_K)) {
// this allocates all Vulkan resources and memory buffers
ctx->ctx_kompute = ggml_vk_init();