mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-15 14:50:51 +01:00
Consolidate code for mat x vec kernels and use subgroups more extensively.
This commit is contained in:
parent
77135a3bf5
commit
93306f16d0
@ -165,11 +165,20 @@ std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
|
||||
if (heapSize < memoryRequired)
|
||||
continue;
|
||||
|
||||
vk::PhysicalDeviceSubgroupProperties subgroupProperties;
|
||||
vk::PhysicalDeviceProperties2 deviceProperties2;
|
||||
deviceProperties2.pNext = &subgroupProperties;
|
||||
physicalDevices.at(i).getProperties2(&deviceProperties2);
|
||||
|
||||
if (subgroupProperties.subgroupSize < 32)
|
||||
continue;
|
||||
|
||||
ggml_vk_device d;
|
||||
d.index = i;
|
||||
d.type = properties.deviceType;
|
||||
d.heapSize = heapSize;
|
||||
d.name = properties.deviceName;
|
||||
d.subgroupSize = subgroupProperties.subgroupSize;
|
||||
size_t n_idx = ++count_by_name[d.name];
|
||||
if (n_idx > 1) {
|
||||
d.name += " (" + std::to_string(n_idx) + ")";
|
||||
@ -242,7 +251,7 @@ bool ggml_vk_init_device(const ggml_vk_device &device) {
|
||||
bool ggml_vk_init_device(int device) {
|
||||
komputeManager()->initializeDevice(device, {},
|
||||
{"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
|
||||
"VK_KHR_16bit_storage", "VK_KHR_storage_buffer_storage_class"});
|
||||
"VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"});
|
||||
return ggml_vk_has_device();
|
||||
}
|
||||
|
||||
@ -772,9 +781,10 @@ void ggml_vk_soft_max(kp::Sequence& seq,
|
||||
};
|
||||
|
||||
std::shared_ptr<kp::Algorithm> s_algo = nullptr;
|
||||
if (!komputeManager()->hasAlgorithm(__func__))
|
||||
s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
|
||||
else {
|
||||
if (!komputeManager()->hasAlgorithm(__func__)) {
|
||||
const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
|
||||
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
|
||||
} else {
|
||||
s_algo = komputeManager()->getAlgorithm(__func__);
|
||||
s_algo->setTensors({in, out});
|
||||
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
|
||||
@ -890,9 +900,10 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq,
|
||||
};
|
||||
|
||||
std::shared_ptr<kp::Algorithm> s_algo = nullptr;
|
||||
if (!komputeManager()->hasAlgorithm(__func__))
|
||||
s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts});
|
||||
else {
|
||||
if (!komputeManager()->hasAlgorithm(__func__)) {
|
||||
const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
|
||||
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
|
||||
} else {
|
||||
s_algo = komputeManager()->getAlgorithm(__func__);
|
||||
s_algo->setTensors({inA, inB, out});
|
||||
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)});
|
||||
@ -907,26 +918,28 @@ void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_siz
|
||||
const std::shared_ptr<kp::Tensor>& inB,
|
||||
const std::shared_ptr<kp::Tensor>& out,
|
||||
uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
|
||||
int32_t ne00, int32_t ne10, int32_t ne0,
|
||||
int32_t ne01, int32_t ne11) {
|
||||
int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
|
||||
int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) {
|
||||
struct PushConstants {
|
||||
uint32_t inAOff, inBOff, outOff;
|
||||
int32_t ne00, ne10, ne0;
|
||||
int32_t ne00, ne10, ne0, ne1, ne01, gqa;
|
||||
} pushConsts {
|
||||
safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
|
||||
ne00, ne10, ne0,
|
||||
ne00, ne10, ne0, ne1, ne01, ne12/ne02
|
||||
};
|
||||
|
||||
std::shared_ptr<kp::Algorithm> s_algo = nullptr;
|
||||
if (!komputeManager()->hasAlgorithm(__func__))
|
||||
s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts});
|
||||
else {
|
||||
if (!komputeManager()->hasAlgorithm(__func__)) {
|
||||
const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
|
||||
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
|
||||
} else {
|
||||
s_algo = komputeManager()->getAlgorithm(__func__);
|
||||
s_algo->setTensors({inA, inB, out});
|
||||
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11)});
|
||||
s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)});
|
||||
s_algo->setPushConstants<PushConstants>({pushConsts});
|
||||
s_algo->updateDescriptors(s_kompute_context->pool.get());
|
||||
}
|
||||
seq.record<kp::OpTensorFill>({out});
|
||||
seq.record<kp::OpAlgoDispatch>(s_algo);
|
||||
}
|
||||
|
||||
@ -1182,7 +1195,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
|
||||
const uint32_t nb3 = dst ? dst->nb[3] : 0;
|
||||
|
||||
const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
|
||||
// const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
|
||||
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
|
||||
const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
|
||||
|
||||
const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
|
||||
@ -1263,30 +1276,46 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
if ((src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32)
|
||||
&& src1->type == GGML_TYPE_F32) {
|
||||
ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
|
||||
} else if (src0->type == GGML_TYPE_Q4_0
|
||||
&& src1->type == GGML_TYPE_F32) {
|
||||
ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
|
||||
} else if (src0->type == GGML_TYPE_Q4_1
|
||||
&& src1->type == GGML_TYPE_F32) {
|
||||
ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
|
||||
} else {
|
||||
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0->type, src1->type);
|
||||
if (src1t != GGML_TYPE_F32) {
|
||||
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
|
||||
goto not_implemented;
|
||||
}
|
||||
|
||||
if (!ggml_is_transposed(src0)
|
||||
&& !ggml_is_transposed(src1)
|
||||
&& ne00%32 == 0
|
||||
&& ne11 > 1) {
|
||||
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
|
||||
goto not_implemented;
|
||||
} else {
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
|
||||
break;
|
||||
default: {
|
||||
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
|
||||
goto not_implemented;
|
||||
}
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
if (src0->type == GGML_TYPE_F16) {
|
||||
if (src0t == GGML_TYPE_F16) {
|
||||
ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
|
||||
} else if (src0->type == GGML_TYPE_Q4_0) {
|
||||
} else if (src0t == GGML_TYPE_Q4_0) {
|
||||
ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
|
||||
} else if (src0->type == GGML_TYPE_Q4_1) {
|
||||
} else if (src0t == GGML_TYPE_Q4_1) {
|
||||
ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
|
||||
} else {
|
||||
fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0->type);
|
||||
fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
|
||||
goto not_implemented;
|
||||
}
|
||||
} break;
|
||||
|
@ -34,6 +34,7 @@ struct ggml_vk_device {
|
||||
size_t heapSize = 0;
|
||||
std::string name;
|
||||
std::string vendor;
|
||||
int subgroupSize = 0;
|
||||
};
|
||||
|
||||
std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired);
|
||||
|
@ -43,7 +43,7 @@ void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based fro
|
||||
const uint nb = k / qk;
|
||||
|
||||
for (uint i = 0; i < nb; i++) {
|
||||
const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_0);
|
||||
const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_1);
|
||||
|
||||
const float16_t d = block.d;
|
||||
const float16_t m = block.m;
|
||||
|
@ -10,7 +10,9 @@
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 64) in;
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
@ -29,8 +31,6 @@ layout (push_constant) uniform parameter {
|
||||
int ne1;
|
||||
} pcs;
|
||||
|
||||
shared float sum[gl_WorkGroupSize.x];
|
||||
|
||||
void main() {
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
@ -39,24 +39,13 @@ void main() {
|
||||
const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
|
||||
const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // based from inB
|
||||
|
||||
sum[gl_LocalInvocationID.x] = 0.0;
|
||||
|
||||
for (uint i = gl_LocalInvocationID.x; i < pcs.ne00; i += gl_WorkGroupSize.x) {
|
||||
sum[gl_LocalInvocationID.x] += float(inA[x+i]) * float(inB[y+i]);
|
||||
float sumf = 0.0f;
|
||||
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
|
||||
sumf += float(inA[x+i]) * float(inB[y+i]);
|
||||
}
|
||||
|
||||
// accumulate the sum from all threads in the threadgroup
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||
if (gl_LocalInvocationID.x < i) {
|
||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
}
|
||||
|
||||
if (gl_LocalInvocationID.x == 0) {
|
||||
out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = sum[0];
|
||||
const float all_sum = subgroupAdd(sumf);
|
||||
if (subgroupElect()) {
|
||||
out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
||||
|
@ -10,7 +10,13 @@
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 8, local_size_y = 8) in;
|
||||
#define BLOCKS_IN_QUANT QK4_0
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_0
|
||||
#define N_ROWS 4
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
layout(local_size_y = 1) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
@ -23,58 +29,31 @@ layout (push_constant) uniform parameter {
|
||||
int ne00;
|
||||
int ne10;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne01;
|
||||
int gqa;
|
||||
} pcs;
|
||||
|
||||
shared float sum[64];
|
||||
|
||||
void main() {
|
||||
const uint nb = uint(pcs.ne00/QK4_0);
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
|
||||
const uint x = r0*nb; // Based from inA without base offset
|
||||
const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
|
||||
|
||||
const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
|
||||
const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
|
||||
|
||||
const uint ix = gl_LocalInvocationID.y/4; // 0 or 1
|
||||
const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3
|
||||
|
||||
const uint first = 4 * iy;
|
||||
|
||||
float sumf = 0.0;
|
||||
|
||||
for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
|
||||
const uint index = (x+i)*sizeof_block_q4_0+pcs.inAOff;
|
||||
const float d = float(u8BufToFloat16(inA, index));
|
||||
|
||||
const uint xl = first; // Based from bl->qs
|
||||
const uint yl = y + i * QK4_0 + first; // Based from inB
|
||||
|
||||
// The q4_0 version of this function
|
||||
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
|
||||
vec2 acc = vec2(0.0, 0.0);
|
||||
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
float d = float(u8BufToFloat16(inA, index));
|
||||
float sumy = 0.0f;
|
||||
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
|
||||
const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
|
||||
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
const uint8_t b = inA[index+2+xl+j];
|
||||
acc.x += inB[yl+j] * (b & 0xF) + inB[yl+j+16] * (b >> 4);
|
||||
acc.y += inB[yl+j] + inB[yl+j+16];
|
||||
}
|
||||
const float yl0 = inB[yb + i];
|
||||
const float yl1 = inB[yb + i + 1];
|
||||
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
|
||||
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
|
||||
|
||||
sumf += d * (acc.x - 8.*acc.y);
|
||||
}
|
||||
sumy += yl0 + yl1 + yl8 + yl9;
|
||||
|
||||
sum[ith] = sumf;
|
||||
|
||||
//
|
||||
// Accumulate the sum from all threads in the threadgroup
|
||||
//
|
||||
barrier();
|
||||
if (ith == 0) {
|
||||
float sumTotal = 0.0;
|
||||
for (uint i = 0; i < nth; ++i) {
|
||||
sumTotal += sum[i];
|
||||
}
|
||||
out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sumTotal;
|
||||
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
|
||||
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
|
||||
}
|
||||
return d * (sumy * -8.f + acc[0] + acc[1]);
|
||||
}
|
||||
|
||||
#include "op_mul_mv_q_n.comp"
|
||||
|
@ -10,7 +10,13 @@
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 8, local_size_y = 8) in;
|
||||
#define BLOCKS_IN_QUANT QK4_1
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_1
|
||||
#define N_ROWS 4
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
layout(local_size_y = 1) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
@ -23,81 +29,33 @@ layout (push_constant) uniform parameter {
|
||||
int ne00;
|
||||
int ne10;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne01;
|
||||
int gqa;
|
||||
} pcs;
|
||||
|
||||
shared float sum[gl_WorkGroupSize.x*gl_WorkGroupSize.y];
|
||||
|
||||
#define UNALIGNED_INPUT inA
|
||||
|
||||
block_q4_1 get_unaligned_block_q4_1(uint index) {
|
||||
block_q4_1 fres;
|
||||
fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
|
||||
fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2);
|
||||
[[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
|
||||
fres.qs[it] = UNALIGNED_INPUT[index+4+it];
|
||||
}
|
||||
return fres;
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint nb = uint(pcs.ne00/QK4_1);
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
|
||||
const uint x = r0*nb; // Based from inA without base offset
|
||||
const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
|
||||
|
||||
const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
|
||||
const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
|
||||
|
||||
const uint ix = gl_LocalInvocationID.y/4; // 0 or 1
|
||||
const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3
|
||||
|
||||
const uint first = 4 * iy;
|
||||
|
||||
float sumf = 0.0;
|
||||
|
||||
for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
|
||||
//TODO: Removing the use of pointers has been quite hairy here. If something goes wrong here, this is most likely it:
|
||||
|
||||
const block_q4_1 block = get_unaligned_block_q4_1((x+i)*sizeof_block_q4_1+pcs.inAOff);
|
||||
|
||||
const float d = float(block.d);
|
||||
const float m = float(block.m);
|
||||
|
||||
const uint xl = first; // Based from bl->qs
|
||||
const uint yl = y + i * QK4_1 + first; // Based from inB
|
||||
|
||||
// The q4_1 version of this function
|
||||
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
|
||||
vec2 acc = vec2(0.0, 0.0);
|
||||
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
float d = float(u8BufToFloat16(inA, index));
|
||||
float m = float(u8BufToFloat16(inA, index+2));
|
||||
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
acc.x += inB[yl+j] * (d * (block.qs[xl+j] & 0xF) + m);
|
||||
acc.y += inB[yl+j+16] * (d * (block.qs[xl+j] >> 4) + m);
|
||||
}
|
||||
float sumy = 0.0f;
|
||||
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
|
||||
const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
|
||||
|
||||
sumf += d * (acc.x - acc.y);
|
||||
}
|
||||
const float yl0 = inB[yb + i];
|
||||
const float yl1 = inB[yb + i + 1];
|
||||
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
|
||||
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
|
||||
|
||||
sum[ith] = sumf;
|
||||
sumy += yl0 + yl1 + yl8 + yl9;
|
||||
|
||||
//
|
||||
// Accumulate the sum from all threads in the threadgroup
|
||||
//
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
if (ith%4 == 0) {
|
||||
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
if (ith%16 == 0) {
|
||||
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
if (ith == 0) {
|
||||
for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
|
||||
out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sum[0];
|
||||
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
|
||||
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
|
||||
}
|
||||
return d * (acc[0] + acc[1]) + sumy * m;
|
||||
}
|
||||
|
||||
#include "op_mul_mv_q_n.comp"
|
||||
|
49
kompute/op_mul_mv_q_n.comp
Normal file
49
kompute/op_mul_mv_q_n.comp
Normal file
@ -0,0 +1,49 @@
|
||||
/**
|
||||
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
|
||||
*
|
||||
* This software is licensed under the terms of the Software for Open Models License (SOM),
|
||||
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
|
||||
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
|
||||
*/
|
||||
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#extension GL_EXT_debug_printf : enable
|
||||
|
||||
// Shared entry point for the quantized mat x vec kernels.
//
// The including shader is expected to have defined BLOCKS_IN_QUANT and N_ROWS,
// declared the `pcs` push constants and the `out_` buffer, and provided
// block_q_n_dot_y(block_index, yb, il) before including this file.
//
// Every subgroup accumulates N_ROWS consecutive rows beginning at `first_row`.
// Inside a subgroup, invocations 2k and 2k+1 visit the same block index
// (ix == k) and differ only in `il` (0 or BLOCKS_IN_QUANT/4), which is handed
// to block_q_n_dot_y together with the matching offset into inB.
void main() {
    const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
    const uint r0 = gl_WorkGroupID.x;
    const uint r1 = gl_WorkGroupID.y;
    const uint im = gl_WorkGroupID.z;

    // First row handled by this subgroup; uniform across the subgroup.
    const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
    const uint offset0 = first_row * nb + im/pcs.gqa*(nb*pcs.ne0);

    const uint x = offset0; // Based from inA without base offset
    const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB

    // Zero the accumulators with a loop so the code stays valid for any N_ROWS.
    float sumf[N_ROWS];
    for (int row = 0; row < N_ROWS; row++) {
        sumf[row] = 0.0f;
    }

    const uint ix = gl_SubgroupInvocationID/2;
    const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);

    uint yb = y + ix * BLOCKS_IN_QUANT + il;

    for (uint ib = ix; ib < nb; ib += gl_SubgroupSize/2) {
        for (int row = 0; row < N_ROWS; row++) {
            // Rows at or beyond ne01 are never stored below, so do not read
            // their blocks either. The condition does not depend on the
            // invocation index, so the whole subgroup takes the same branch.
            if (first_row + row < pcs.ne01) {
                const uint block_index = x + ib + row * nb;
                sumf[row] += block_q_n_dot_y(block_index, yb, il);
            }
        }

        yb += BLOCKS_IN_QUANT * gl_SubgroupSize/2;
    }

    // Reduce each row across the subgroup; one elected invocation stores it.
    // subgroupAdd is called unconditionally so every invocation participates.
    for (int row = 0; row < N_ROWS; ++row) {
        const float tot = subgroupAdd(sumf[row]);
        if (first_row + row < pcs.ne01 && subgroupElect()) {
            out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
        }
    }
}
|
@ -10,9 +10,9 @@
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define nth 32
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
|
||||
layout(local_size_x = nth) in;
|
||||
layout(local_size_x_id = 0) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
@ -25,8 +25,6 @@ layout(push_constant) uniform PushConstants {
|
||||
int ne02;
|
||||
} pcs;
|
||||
|
||||
shared float buf[nth];
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
@ -37,46 +35,22 @@ void main() {
|
||||
const uint pdst = extra_off + pcs.outOff; // Based from out_
|
||||
|
||||
// parallel max
|
||||
buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000);
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
||||
buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[psrc0 + i00]);
|
||||
float localMax = uintBitsToFloat(0xFF800000);
|
||||
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
|
||||
localMax = max(localMax, in_[psrc0 + i00]);
|
||||
}
|
||||
|
||||
// reduce
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
[[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
|
||||
if (gl_LocalInvocationID.x < i) {
|
||||
buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]);
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
}
|
||||
|
||||
// broadcast
|
||||
const float max_ = buf[0];
|
||||
float max_ = subgroupMax(localMax);
|
||||
|
||||
// parallel sum
|
||||
buf[gl_LocalInvocationID.x] = 0.0;
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
||||
buf[gl_LocalInvocationID.x] += exp(in_[psrc0 + i00] - max_);
|
||||
float localSum = 0.0f;
|
||||
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
|
||||
const float exp_psrc0 = exp(in_[psrc0 + i00] - max_);
|
||||
localSum += exp_psrc0;
|
||||
out_[pdst + i00] = exp_psrc0;
|
||||
}
|
||||
|
||||
// reduce
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
[[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
|
||||
if (gl_LocalInvocationID.x < i) {
|
||||
buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
}
|
||||
|
||||
// broadcast
|
||||
const float sum = buf[0];
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
||||
out_[pdst + i00] = exp(in_[psrc0 + i00] - max_) / sum;
|
||||
const float sum = subgroupAdd(localSum);
|
||||
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
|
||||
out_[pdst + i00] /= sum;
|
||||
}
|
||||
}
|
||||
|
@ -13,6 +13,7 @@ add_library(kompute STATIC Algorithm.cpp
|
||||
OpAlgoDispatch.cpp
|
||||
OpMemoryBarrier.cpp
|
||||
OpTensorCopy.cpp
|
||||
OpTensorFill.cpp
|
||||
OpTensorSyncDevice.cpp
|
||||
OpTensorSyncLocal.cpp
|
||||
OpBufferSyncDevice.cpp
|
||||
|
55
kompute/src/OpTensorFill.cpp
Normal file
55
kompute/src/OpTensorFill.cpp
Normal file
@ -0,0 +1,55 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
/**
|
||||
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
|
||||
*
|
||||
* This software is licensed under the terms of the Software for Open Models License (SOM),
|
||||
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
|
||||
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
|
||||
*/
|
||||
|
||||
#include "kompute/operations/OpTensorFill.hpp"
|
||||
#include "kompute/Tensor.hpp"
|
||||
|
||||
namespace kp {
|
||||
|
||||
OpTensorFill::OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors)
|
||||
{
|
||||
KP_LOG_DEBUG("Kompute OpTensorFill constructor with params");
|
||||
|
||||
if (tensors.size() < 1) {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpTensorFill called with less than 1 tensor");
|
||||
}
|
||||
|
||||
this->mTensors = tensors;
|
||||
}
|
||||
|
||||
OpTensorFill::~OpTensorFill()
|
||||
{
|
||||
KP_LOG_DEBUG("Kompute OpTensorFill destructor started");
|
||||
}
|
||||
|
||||
void
|
||||
OpTensorFill::record(const vk::CommandBuffer& commandBuffer)
|
||||
{
|
||||
KP_LOG_DEBUG("Kompute OpTensorFill record called");
|
||||
|
||||
for (size_t i = 0; i < this->mTensors.size(); i++) {
|
||||
this->mTensors[i]->recordFill(commandBuffer, 0);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
OpTensorFill::preEval(const vk::CommandBuffer& /*commandBuffer*/)
|
||||
{
|
||||
KP_LOG_DEBUG("Kompute OpTensorFill preEval called");
|
||||
}
|
||||
|
||||
void
|
||||
OpTensorFill::postEval(const vk::CommandBuffer& /*commandBuffer*/)
|
||||
{
|
||||
KP_LOG_DEBUG("Kompute OpTensorFill postEval called");
|
||||
}
|
||||
|
||||
}
|
@ -215,6 +215,13 @@ Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
|
||||
commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion);
|
||||
}
|
||||
|
||||
void
|
||||
Tensor::recordFill(const vk::CommandBuffer &commandBuffer,
|
||||
uint32_t fill)
|
||||
{
|
||||
commandBuffer.fillBuffer(*this->mPrimaryBuffer, mOffset, this->memorySize(), fill);
|
||||
}
|
||||
|
||||
void
|
||||
Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
|
||||
vk::AccessFlagBits srcAccessMask,
|
||||
|
@ -21,6 +21,7 @@ target_sources(kompute PRIVATE
|
||||
kompute/operations/OpMemoryBarrier.hpp
|
||||
kompute/operations/OpMult.hpp
|
||||
kompute/operations/OpTensorCopy.hpp
|
||||
kompute/operations/OpTensorFill.hpp
|
||||
kompute/operations/OpTensorSyncDevice.hpp
|
||||
kompute/operations/OpTensorSyncLocal.hpp
|
||||
kompute/operations/OpBufferSyncDevice.hpp
|
||||
|
@ -15,6 +15,7 @@
|
||||
#include "operations/OpTensorSyncLocal.hpp"
|
||||
#include "operations/OpBufferSyncDevice.hpp"
|
||||
#include "operations/OpBufferSyncLocal.hpp"
|
||||
#include "operations/OpTensorFill.hpp"
|
||||
|
||||
// Will be build by CMake and placed inside the build directory
|
||||
#include "ShaderLogisticRegression.hpp"
|
||||
|
@ -126,6 +126,9 @@ class Tensor
|
||||
void recordCopyFrom(const vk::CommandBuffer& commandBuffer,
|
||||
std::shared_ptr<Tensor> copyFromTensor);
|
||||
|
||||
void recordFill(const vk::CommandBuffer &commandBuffer,
|
||||
uint32_t fill);
|
||||
|
||||
/**
|
||||
* Records a copy from the internal staging memory to the device memory
|
||||
* using an optional barrier to wait for the operation. This function would
|
||||
@ -279,6 +282,7 @@ class Tensor
|
||||
vk::Buffer *bufferTo,
|
||||
vk::DeviceSize bufferSize,
|
||||
vk::BufferCopy copyRegion);
|
||||
|
||||
void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
|
||||
const vk::Buffer& buffer,
|
||||
vk::AccessFlagBits srcAccessMask,
|
||||
|
58
kompute/src/include/kompute/operations/OpTensorFill.hpp
Normal file
58
kompute/src/include/kompute/operations/OpTensorFill.hpp
Normal file
@ -0,0 +1,58 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
#pragma once
|
||||
|
||||
#include "kompute/Core.hpp"
|
||||
|
||||
#include "kompute/Tensor.hpp"
|
||||
|
||||
#include "kompute/operations/OpBase.hpp"
|
||||
|
||||
namespace kp {
|
||||
|
||||
/**
|
||||
* Operation that fills the tensor
|
||||
*/
|
||||
class OpTensorFill : public OpBase
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Default constructor with parameters that provides the core vulkan
|
||||
* resources and the tensors that will be used in the operation.
|
||||
*
|
||||
* @param tensors Tensors that will be used to create in operation.
|
||||
*/
|
||||
OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors);
|
||||
|
||||
/**
|
||||
* Default destructor. This class does not manage memory so it won't be
|
||||
* expecting the parent to perform a release.
|
||||
*/
|
||||
~OpTensorFill() override;
|
||||
|
||||
/**
|
||||
* Records the fill command for tensor.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
void record(const vk::CommandBuffer& commandBuffer) override;
|
||||
|
||||
/**
|
||||
* Does not perform any preEval commands.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
|
||||
|
||||
/**
|
||||
* Does not perform any postEval commands.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
|
||||
|
||||
private:
|
||||
// -------------- ALWAYS OWNED RESOURCES
|
||||
std::vector<std::shared_ptr<Tensor>> mTensors;
|
||||
};
|
||||
|
||||
} // End namespace kp
|
@ -6495,7 +6495,8 @@ struct llama_context * llama_new_context_with_model(
|
||||
if (ggml_vk_has_device() && params.n_gpu_layers > 0
|
||||
&& (model->ftype == LLAMA_FTYPE_ALL_F32
|
||||
|| model->ftype == LLAMA_FTYPE_MOSTLY_F16
|
||||
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0)) {
|
||||
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
|
||||
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
|
||||
// this allocates all Vulkan resources and memory buffers
|
||||
ctx->ctx_kompute = ggml_vk_init();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user