mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-15 14:50:51 +01:00
Consolidate code for mat x vec kernels and use subgroups more extensively.
This commit is contained in:
parent
77135a3bf5
commit
93306f16d0
@ -165,11 +165,20 @@ std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
|
|||||||
if (heapSize < memoryRequired)
|
if (heapSize < memoryRequired)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
vk::PhysicalDeviceSubgroupProperties subgroupProperties;
|
||||||
|
vk::PhysicalDeviceProperties2 deviceProperties2;
|
||||||
|
deviceProperties2.pNext = &subgroupProperties;
|
||||||
|
physicalDevices.at(i).getProperties2(&deviceProperties2);
|
||||||
|
|
||||||
|
if (subgroupProperties.subgroupSize < 32)
|
||||||
|
continue;
|
||||||
|
|
||||||
ggml_vk_device d;
|
ggml_vk_device d;
|
||||||
d.index = i;
|
d.index = i;
|
||||||
d.type = properties.deviceType;
|
d.type = properties.deviceType;
|
||||||
d.heapSize = heapSize;
|
d.heapSize = heapSize;
|
||||||
d.name = properties.deviceName;
|
d.name = properties.deviceName;
|
||||||
|
d.subgroupSize = subgroupProperties.subgroupSize;
|
||||||
size_t n_idx = ++count_by_name[d.name];
|
size_t n_idx = ++count_by_name[d.name];
|
||||||
if (n_idx > 1) {
|
if (n_idx > 1) {
|
||||||
d.name += " (" + std::to_string(n_idx) + ")";
|
d.name += " (" + std::to_string(n_idx) + ")";
|
||||||
@ -242,7 +251,7 @@ bool ggml_vk_init_device(const ggml_vk_device &device) {
|
|||||||
bool ggml_vk_init_device(int device) {
|
bool ggml_vk_init_device(int device) {
|
||||||
komputeManager()->initializeDevice(device, {},
|
komputeManager()->initializeDevice(device, {},
|
||||||
{"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
|
{"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
|
||||||
"VK_KHR_16bit_storage", "VK_KHR_storage_buffer_storage_class"});
|
"VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"});
|
||||||
return ggml_vk_has_device();
|
return ggml_vk_has_device();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -772,9 +781,10 @@ void ggml_vk_soft_max(kp::Sequence& seq,
|
|||||||
};
|
};
|
||||||
|
|
||||||
std::shared_ptr<kp::Algorithm> s_algo = nullptr;
|
std::shared_ptr<kp::Algorithm> s_algo = nullptr;
|
||||||
if (!komputeManager()->hasAlgorithm(__func__))
|
if (!komputeManager()->hasAlgorithm(__func__)) {
|
||||||
s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
|
const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
|
||||||
else {
|
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
|
||||||
|
} else {
|
||||||
s_algo = komputeManager()->getAlgorithm(__func__);
|
s_algo = komputeManager()->getAlgorithm(__func__);
|
||||||
s_algo->setTensors({in, out});
|
s_algo->setTensors({in, out});
|
||||||
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
|
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
|
||||||
@ -890,9 +900,10 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq,
|
|||||||
};
|
};
|
||||||
|
|
||||||
std::shared_ptr<kp::Algorithm> s_algo = nullptr;
|
std::shared_ptr<kp::Algorithm> s_algo = nullptr;
|
||||||
if (!komputeManager()->hasAlgorithm(__func__))
|
if (!komputeManager()->hasAlgorithm(__func__)) {
|
||||||
s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts});
|
const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
|
||||||
else {
|
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
|
||||||
|
} else {
|
||||||
s_algo = komputeManager()->getAlgorithm(__func__);
|
s_algo = komputeManager()->getAlgorithm(__func__);
|
||||||
s_algo->setTensors({inA, inB, out});
|
s_algo->setTensors({inA, inB, out});
|
||||||
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)});
|
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)});
|
||||||
@ -907,26 +918,28 @@ void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_siz
|
|||||||
const std::shared_ptr<kp::Tensor>& inB,
|
const std::shared_ptr<kp::Tensor>& inB,
|
||||||
const std::shared_ptr<kp::Tensor>& out,
|
const std::shared_ptr<kp::Tensor>& out,
|
||||||
uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
|
uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
|
||||||
int32_t ne00, int32_t ne10, int32_t ne0,
|
int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
|
||||||
int32_t ne01, int32_t ne11) {
|
int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) {
|
||||||
struct PushConstants {
|
struct PushConstants {
|
||||||
uint32_t inAOff, inBOff, outOff;
|
uint32_t inAOff, inBOff, outOff;
|
||||||
int32_t ne00, ne10, ne0;
|
int32_t ne00, ne10, ne0, ne1, ne01, gqa;
|
||||||
} pushConsts {
|
} pushConsts {
|
||||||
safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
|
safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
|
||||||
ne00, ne10, ne0,
|
ne00, ne10, ne0, ne1, ne01, ne12/ne02
|
||||||
};
|
};
|
||||||
|
|
||||||
std::shared_ptr<kp::Algorithm> s_algo = nullptr;
|
std::shared_ptr<kp::Algorithm> s_algo = nullptr;
|
||||||
if (!komputeManager()->hasAlgorithm(__func__))
|
if (!komputeManager()->hasAlgorithm(__func__)) {
|
||||||
s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts});
|
const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
|
||||||
else {
|
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
|
||||||
|
} else {
|
||||||
s_algo = komputeManager()->getAlgorithm(__func__);
|
s_algo = komputeManager()->getAlgorithm(__func__);
|
||||||
s_algo->setTensors({inA, inB, out});
|
s_algo->setTensors({inA, inB, out});
|
||||||
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11)});
|
s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)});
|
||||||
s_algo->setPushConstants<PushConstants>({pushConsts});
|
s_algo->setPushConstants<PushConstants>({pushConsts});
|
||||||
s_algo->updateDescriptors(s_kompute_context->pool.get());
|
s_algo->updateDescriptors(s_kompute_context->pool.get());
|
||||||
}
|
}
|
||||||
|
seq.record<kp::OpTensorFill>({out});
|
||||||
seq.record<kp::OpAlgoDispatch>(s_algo);
|
seq.record<kp::OpAlgoDispatch>(s_algo);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1182,7 +1195,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
|
|||||||
const uint32_t nb3 = dst ? dst->nb[3] : 0;
|
const uint32_t nb3 = dst ? dst->nb[3] : 0;
|
||||||
|
|
||||||
const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
|
const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
|
||||||
// const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
|
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
|
||||||
const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
|
const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
|
||||||
|
|
||||||
const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
|
const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
|
||||||
@ -1263,30 +1276,46 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
|
|||||||
} break;
|
} break;
|
||||||
case GGML_OP_MUL_MAT:
|
case GGML_OP_MUL_MAT:
|
||||||
{
|
{
|
||||||
if ((src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32)
|
if (src1t != GGML_TYPE_F32) {
|
||||||
&& src1->type == GGML_TYPE_F32) {
|
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
|
||||||
ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
|
|
||||||
} else if (src0->type == GGML_TYPE_Q4_0
|
|
||||||
&& src1->type == GGML_TYPE_F32) {
|
|
||||||
ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
|
|
||||||
} else if (src0->type == GGML_TYPE_Q4_1
|
|
||||||
&& src1->type == GGML_TYPE_F32) {
|
|
||||||
ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
|
|
||||||
} else {
|
|
||||||
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0->type, src1->type);
|
|
||||||
goto not_implemented;
|
goto not_implemented;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!ggml_is_transposed(src0)
|
||||||
|
&& !ggml_is_transposed(src1)
|
||||||
|
&& ne00%32 == 0
|
||||||
|
&& ne11 > 1) {
|
||||||
|
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
|
||||||
|
goto not_implemented;
|
||||||
|
} else {
|
||||||
|
switch (src0t) {
|
||||||
|
case GGML_TYPE_F16:
|
||||||
|
case GGML_TYPE_F32:
|
||||||
|
ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_Q4_0:
|
||||||
|
ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_Q4_1:
|
||||||
|
ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
|
||||||
|
break;
|
||||||
|
default: {
|
||||||
|
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
|
||||||
|
goto not_implemented;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_GET_ROWS:
|
case GGML_OP_GET_ROWS:
|
||||||
{
|
{
|
||||||
if (src0->type == GGML_TYPE_F16) {
|
if (src0t == GGML_TYPE_F16) {
|
||||||
ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
|
ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
|
||||||
} else if (src0->type == GGML_TYPE_Q4_0) {
|
} else if (src0t == GGML_TYPE_Q4_0) {
|
||||||
ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
|
ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
|
||||||
} else if (src0->type == GGML_TYPE_Q4_1) {
|
} else if (src0t == GGML_TYPE_Q4_1) {
|
||||||
ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
|
ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0->type);
|
fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
|
||||||
goto not_implemented;
|
goto not_implemented;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
@ -34,6 +34,7 @@ struct ggml_vk_device {
|
|||||||
size_t heapSize = 0;
|
size_t heapSize = 0;
|
||||||
std::string name;
|
std::string name;
|
||||||
std::string vendor;
|
std::string vendor;
|
||||||
|
int subgroupSize = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired);
|
std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired);
|
||||||
|
@ -43,7 +43,7 @@ void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based fro
|
|||||||
const uint nb = k / qk;
|
const uint nb = k / qk;
|
||||||
|
|
||||||
for (uint i = 0; i < nb; i++) {
|
for (uint i = 0; i < nb; i++) {
|
||||||
const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_0);
|
const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_1);
|
||||||
|
|
||||||
const float16_t d = block.d;
|
const float16_t d = block.d;
|
||||||
const float16_t m = block.m;
|
const float16_t m = block.m;
|
||||||
|
@ -10,7 +10,9 @@
|
|||||||
|
|
||||||
#include "common.comp"
|
#include "common.comp"
|
||||||
|
|
||||||
layout(local_size_x = 64) in;
|
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||||
|
|
||||||
|
layout(local_size_x_id = 0) in;
|
||||||
|
|
||||||
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
|
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
|
||||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||||
@ -29,8 +31,6 @@ layout (push_constant) uniform parameter {
|
|||||||
int ne1;
|
int ne1;
|
||||||
} pcs;
|
} pcs;
|
||||||
|
|
||||||
shared float sum[gl_WorkGroupSize.x];
|
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint r0 = gl_WorkGroupID.x;
|
const uint r0 = gl_WorkGroupID.x;
|
||||||
const uint r1 = gl_WorkGroupID.y;
|
const uint r1 = gl_WorkGroupID.y;
|
||||||
@ -39,24 +39,13 @@ void main() {
|
|||||||
const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
|
const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
|
||||||
const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // based from inB
|
const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // based from inB
|
||||||
|
|
||||||
sum[gl_LocalInvocationID.x] = 0.0;
|
float sumf = 0.0f;
|
||||||
|
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
|
||||||
for (uint i = gl_LocalInvocationID.x; i < pcs.ne00; i += gl_WorkGroupSize.x) {
|
sumf += float(inA[x+i]) * float(inB[y+i]);
|
||||||
sum[gl_LocalInvocationID.x] += float(inA[x+i]) * float(inB[y+i]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// accumulate the sum from all threads in the threadgroup
|
const float all_sum = subgroupAdd(sumf);
|
||||||
barrier();
|
if (subgroupElect()) {
|
||||||
memoryBarrierShared();
|
out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
|
||||||
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
|
||||||
if (gl_LocalInvocationID.x < i) {
|
|
||||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
|
||||||
}
|
|
||||||
barrier();
|
|
||||||
memoryBarrierShared();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (gl_LocalInvocationID.x == 0) {
|
|
||||||
out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = sum[0];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -10,7 +10,13 @@
|
|||||||
|
|
||||||
#include "common.comp"
|
#include "common.comp"
|
||||||
|
|
||||||
layout(local_size_x = 8, local_size_y = 8) in;
|
#define BLOCKS_IN_QUANT QK4_0
|
||||||
|
#define SIZE_OF_BLOCK sizeof_block_q4_0
|
||||||
|
#define N_ROWS 4
|
||||||
|
|
||||||
|
layout(local_size_x_id = 0) in;
|
||||||
|
layout(local_size_y = 1) in;
|
||||||
|
layout(local_size_z = 1) in;
|
||||||
|
|
||||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||||
@ -23,58 +29,31 @@ layout (push_constant) uniform parameter {
|
|||||||
int ne00;
|
int ne00;
|
||||||
int ne10;
|
int ne10;
|
||||||
int ne0;
|
int ne0;
|
||||||
|
int ne1;
|
||||||
|
int ne01;
|
||||||
|
int gqa;
|
||||||
} pcs;
|
} pcs;
|
||||||
|
|
||||||
shared float sum[64];
|
// The q4_0 version of this function
|
||||||
|
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
|
||||||
|
vec2 acc = vec2(0.0, 0.0);
|
||||||
|
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||||
|
float d = float(u8BufToFloat16(inA, index));
|
||||||
|
float sumy = 0.0f;
|
||||||
|
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
|
||||||
|
const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
|
||||||
|
|
||||||
void main() {
|
const float yl0 = inB[yb + i];
|
||||||
const uint nb = uint(pcs.ne00/QK4_0);
|
const float yl1 = inB[yb + i + 1];
|
||||||
|
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
|
||||||
|
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
|
||||||
|
|
||||||
const uint r0 = gl_WorkGroupID.x;
|
sumy += yl0 + yl1 + yl8 + yl9;
|
||||||
const uint r1 = gl_WorkGroupID.y;
|
|
||||||
|
|
||||||
const uint x = r0*nb; // Based from inA without base offset
|
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
|
||||||
const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
|
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
|
||||||
|
|
||||||
const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
|
|
||||||
const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
|
|
||||||
|
|
||||||
const uint ix = gl_LocalInvocationID.y/4; // 0 or 1
|
|
||||||
const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3
|
|
||||||
|
|
||||||
const uint first = 4 * iy;
|
|
||||||
|
|
||||||
float sumf = 0.0;
|
|
||||||
|
|
||||||
for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
|
|
||||||
const uint index = (x+i)*sizeof_block_q4_0+pcs.inAOff;
|
|
||||||
const float d = float(u8BufToFloat16(inA, index));
|
|
||||||
|
|
||||||
const uint xl = first; // Based from bl->qs
|
|
||||||
const uint yl = y + i * QK4_0 + first; // Based from inB
|
|
||||||
|
|
||||||
vec2 acc = vec2(0.0, 0.0);
|
|
||||||
|
|
||||||
for (int j = 0; j < 4; ++j) {
|
|
||||||
const uint8_t b = inA[index+2+xl+j];
|
|
||||||
acc.x += inB[yl+j] * (b & 0xF) + inB[yl+j+16] * (b >> 4);
|
|
||||||
acc.y += inB[yl+j] + inB[yl+j+16];
|
|
||||||
}
|
|
||||||
|
|
||||||
sumf += d * (acc.x - 8.*acc.y);
|
|
||||||
}
|
|
||||||
|
|
||||||
sum[ith] = sumf;
|
|
||||||
|
|
||||||
//
|
|
||||||
// Accumulate the sum from all threads in the threadgroup
|
|
||||||
//
|
|
||||||
barrier();
|
|
||||||
if (ith == 0) {
|
|
||||||
float sumTotal = 0.0;
|
|
||||||
for (uint i = 0; i < nth; ++i) {
|
|
||||||
sumTotal += sum[i];
|
|
||||||
}
|
|
||||||
out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sumTotal;
|
|
||||||
}
|
}
|
||||||
|
return d * (sumy * -8.f + acc[0] + acc[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#include "op_mul_mv_q_n.comp"
|
||||||
|
@ -10,7 +10,13 @@
|
|||||||
|
|
||||||
#include "common.comp"
|
#include "common.comp"
|
||||||
|
|
||||||
layout(local_size_x = 8, local_size_y = 8) in;
|
#define BLOCKS_IN_QUANT QK4_1
|
||||||
|
#define SIZE_OF_BLOCK sizeof_block_q4_1
|
||||||
|
#define N_ROWS 4
|
||||||
|
|
||||||
|
layout(local_size_x_id = 0) in;
|
||||||
|
layout(local_size_y = 1) in;
|
||||||
|
layout(local_size_z = 1) in;
|
||||||
|
|
||||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||||
@ -23,81 +29,33 @@ layout (push_constant) uniform parameter {
|
|||||||
int ne00;
|
int ne00;
|
||||||
int ne10;
|
int ne10;
|
||||||
int ne0;
|
int ne0;
|
||||||
|
int ne1;
|
||||||
|
int ne01;
|
||||||
|
int gqa;
|
||||||
} pcs;
|
} pcs;
|
||||||
|
|
||||||
shared float sum[gl_WorkGroupSize.x*gl_WorkGroupSize.y];
|
// The q4_1 version of this function
|
||||||
|
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
|
||||||
|
vec2 acc = vec2(0.0, 0.0);
|
||||||
|
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||||
|
float d = float(u8BufToFloat16(inA, index));
|
||||||
|
float m = float(u8BufToFloat16(inA, index+2));
|
||||||
|
|
||||||
#define UNALIGNED_INPUT inA
|
float sumy = 0.0f;
|
||||||
|
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
|
||||||
|
const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
|
||||||
|
|
||||||
block_q4_1 get_unaligned_block_q4_1(uint index) {
|
const float yl0 = inB[yb + i];
|
||||||
block_q4_1 fres;
|
const float yl1 = inB[yb + i + 1];
|
||||||
fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
|
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
|
||||||
fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2);
|
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
|
||||||
[[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
|
|
||||||
fres.qs[it] = UNALIGNED_INPUT[index+4+it];
|
sumy += yl0 + yl1 + yl8 + yl9;
|
||||||
|
|
||||||
|
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
|
||||||
|
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
|
||||||
}
|
}
|
||||||
return fres;
|
return d * (acc[0] + acc[1]) + sumy * m;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main() {
|
#include "op_mul_mv_q_n.comp"
|
||||||
const uint nb = uint(pcs.ne00/QK4_1);
|
|
||||||
|
|
||||||
const uint r0 = gl_WorkGroupID.x;
|
|
||||||
const uint r1 = gl_WorkGroupID.y;
|
|
||||||
|
|
||||||
const uint x = r0*nb; // Based from inA without base offset
|
|
||||||
const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
|
|
||||||
|
|
||||||
const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
|
|
||||||
const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
|
|
||||||
|
|
||||||
const uint ix = gl_LocalInvocationID.y/4; // 0 or 1
|
|
||||||
const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3
|
|
||||||
|
|
||||||
const uint first = 4 * iy;
|
|
||||||
|
|
||||||
float sumf = 0.0;
|
|
||||||
|
|
||||||
for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
|
|
||||||
//TODO: Removing the use of pointers has been quite hairy here. If something goes wrong here, this is most likely it:
|
|
||||||
|
|
||||||
const block_q4_1 block = get_unaligned_block_q4_1((x+i)*sizeof_block_q4_1+pcs.inAOff);
|
|
||||||
|
|
||||||
const float d = float(block.d);
|
|
||||||
const float m = float(block.m);
|
|
||||||
|
|
||||||
const uint xl = first; // Based from bl->qs
|
|
||||||
const uint yl = y + i * QK4_1 + first; // Based from inB
|
|
||||||
|
|
||||||
vec2 acc = vec2(0.0, 0.0);
|
|
||||||
|
|
||||||
for (int j = 0; j < 4; ++j) {
|
|
||||||
acc.x += inB[yl+j] * (d * (block.qs[xl+j] & 0xF) + m);
|
|
||||||
acc.y += inB[yl+j+16] * (d * (block.qs[xl+j] >> 4) + m);
|
|
||||||
}
|
|
||||||
|
|
||||||
sumf += d * (acc.x - acc.y);
|
|
||||||
}
|
|
||||||
|
|
||||||
sum[ith] = sumf;
|
|
||||||
|
|
||||||
//
|
|
||||||
// Accumulate the sum from all threads in the threadgroup
|
|
||||||
//
|
|
||||||
barrier();
|
|
||||||
memoryBarrierShared();
|
|
||||||
if (ith%4 == 0) {
|
|
||||||
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
|
|
||||||
}
|
|
||||||
barrier();
|
|
||||||
memoryBarrierShared();
|
|
||||||
if (ith%16 == 0) {
|
|
||||||
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
|
|
||||||
}
|
|
||||||
barrier();
|
|
||||||
memoryBarrierShared();
|
|
||||||
if (ith == 0) {
|
|
||||||
for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
|
|
||||||
out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sum[0];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
49
kompute/op_mul_mv_q_n.comp
Normal file
49
kompute/op_mul_mv_q_n.comp
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
/**
|
||||||
|
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
|
||||||
|
*
|
||||||
|
* This software is licensed under the terms of the Software for Open Models License (SOM),
|
||||||
|
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
|
||||||
|
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||||
|
#extension GL_EXT_debug_printf : enable
|
||||||
|
|
||||||
|
void main() {
|
||||||
|
const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
|
||||||
|
const uint r0 = gl_WorkGroupID.x;
|
||||||
|
const uint r1 = gl_WorkGroupID.y;
|
||||||
|
const uint im = gl_WorkGroupID.z;
|
||||||
|
const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
|
||||||
|
const uint offset0 = first_row * nb + im/pcs.gqa*(nb*pcs.ne0);
|
||||||
|
|
||||||
|
const uint x = offset0; // Based from inA without base offset
|
||||||
|
const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
|
||||||
|
|
||||||
|
float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||||
|
|
||||||
|
const uint ix = gl_SubgroupInvocationID/2;
|
||||||
|
const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
|
||||||
|
|
||||||
|
uint yb = y + ix * BLOCKS_IN_QUANT + il;
|
||||||
|
|
||||||
|
debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
|
||||||
|
gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
|
||||||
|
gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
|
||||||
|
|
||||||
|
for (uint ib = ix; ib < nb; ib += gl_SubgroupSize/2) {
|
||||||
|
for (int row = 0; row < N_ROWS; row++) {
|
||||||
|
const uint block_index = x + ib + row * nb;
|
||||||
|
sumf[row] += block_q_n_dot_y(block_index, yb, il);
|
||||||
|
}
|
||||||
|
|
||||||
|
yb += BLOCKS_IN_QUANT * gl_SubgroupSize/2;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int row = 0; row < N_ROWS; ++row) {
|
||||||
|
const float tot = subgroupAdd(sumf[row]);
|
||||||
|
if (first_row + row < pcs.ne01 && subgroupElect()) {
|
||||||
|
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -10,9 +10,9 @@
|
|||||||
|
|
||||||
#include "common.comp"
|
#include "common.comp"
|
||||||
|
|
||||||
#define nth 32
|
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||||
|
|
||||||
layout(local_size_x = nth) in;
|
layout(local_size_x_id = 0) in;
|
||||||
|
|
||||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||||
@ -25,8 +25,6 @@ layout(push_constant) uniform PushConstants {
|
|||||||
int ne02;
|
int ne02;
|
||||||
} pcs;
|
} pcs;
|
||||||
|
|
||||||
shared float buf[nth];
|
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint i03 = gl_WorkGroupID.z;
|
const uint i03 = gl_WorkGroupID.z;
|
||||||
const uint i02 = gl_WorkGroupID.y;
|
const uint i02 = gl_WorkGroupID.y;
|
||||||
@ -37,46 +35,22 @@ void main() {
|
|||||||
const uint pdst = extra_off + pcs.outOff; // Based from out_
|
const uint pdst = extra_off + pcs.outOff; // Based from out_
|
||||||
|
|
||||||
// parallel max
|
// parallel max
|
||||||
buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000);
|
float localMax = uintBitsToFloat(0xFF800000);
|
||||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
|
||||||
buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[psrc0 + i00]);
|
localMax = max(localMax, in_[psrc0 + i00]);
|
||||||
}
|
}
|
||||||
|
float max_ = subgroupMax(localMax);
|
||||||
// reduce
|
|
||||||
barrier();
|
|
||||||
memoryBarrierShared();
|
|
||||||
[[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
|
|
||||||
if (gl_LocalInvocationID.x < i) {
|
|
||||||
buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]);
|
|
||||||
}
|
|
||||||
barrier();
|
|
||||||
memoryBarrierShared();
|
|
||||||
}
|
|
||||||
|
|
||||||
// broadcast
|
|
||||||
const float max_ = buf[0];
|
|
||||||
|
|
||||||
// parallel sum
|
// parallel sum
|
||||||
buf[gl_LocalInvocationID.x] = 0.0;
|
float localSum = 0.0f;
|
||||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
|
||||||
buf[gl_LocalInvocationID.x] += exp(in_[psrc0 + i00] - max_);
|
const float exp_psrc0 = exp(in_[psrc0 + i00] - max_);
|
||||||
|
localSum += exp_psrc0;
|
||||||
|
out_[pdst + i00] = exp_psrc0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// reduce
|
const float sum = subgroupAdd(localSum);
|
||||||
barrier();
|
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
|
||||||
memoryBarrierShared();
|
out_[pdst + i00] /= sum;
|
||||||
[[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
|
|
||||||
if (gl_LocalInvocationID.x < i) {
|
|
||||||
buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i];
|
|
||||||
}
|
|
||||||
barrier();
|
|
||||||
memoryBarrierShared();
|
|
||||||
}
|
|
||||||
|
|
||||||
// broadcast
|
|
||||||
const float sum = buf[0];
|
|
||||||
|
|
||||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
|
||||||
out_[pdst + i00] = exp(in_[psrc0 + i00] - max_) / sum;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -13,6 +13,7 @@ add_library(kompute STATIC Algorithm.cpp
|
|||||||
OpAlgoDispatch.cpp
|
OpAlgoDispatch.cpp
|
||||||
OpMemoryBarrier.cpp
|
OpMemoryBarrier.cpp
|
||||||
OpTensorCopy.cpp
|
OpTensorCopy.cpp
|
||||||
|
OpTensorFill.cpp
|
||||||
OpTensorSyncDevice.cpp
|
OpTensorSyncDevice.cpp
|
||||||
OpTensorSyncLocal.cpp
|
OpTensorSyncLocal.cpp
|
||||||
OpBufferSyncDevice.cpp
|
OpBufferSyncDevice.cpp
|
||||||
|
55
kompute/src/OpTensorFill.cpp
Normal file
55
kompute/src/OpTensorFill.cpp
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
|
||||||
|
*
|
||||||
|
* This software is licensed under the terms of the Software for Open Models License (SOM),
|
||||||
|
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
|
||||||
|
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "kompute/operations/OpTensorFill.hpp"
|
||||||
|
#include "kompute/Tensor.hpp"
|
||||||
|
|
||||||
|
namespace kp {
|
||||||
|
|
||||||
|
OpTensorFill::OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors)
|
||||||
|
{
|
||||||
|
KP_LOG_DEBUG("Kompute OpTensorFill constructor with params");
|
||||||
|
|
||||||
|
if (tensors.size() < 1) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"Kompute OpTensorFill called with less than 1 tensor");
|
||||||
|
}
|
||||||
|
|
||||||
|
this->mTensors = tensors;
|
||||||
|
}
|
||||||
|
|
||||||
|
OpTensorFill::~OpTensorFill()
|
||||||
|
{
|
||||||
|
KP_LOG_DEBUG("Kompute OpTensorFill destructor started");
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
OpTensorFill::record(const vk::CommandBuffer& commandBuffer)
|
||||||
|
{
|
||||||
|
KP_LOG_DEBUG("Kompute OpTensorFill record called");
|
||||||
|
|
||||||
|
for (size_t i = 0; i < this->mTensors.size(); i++) {
|
||||||
|
this->mTensors[i]->recordFill(commandBuffer, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
OpTensorFill::preEval(const vk::CommandBuffer& /*commandBuffer*/)
|
||||||
|
{
|
||||||
|
KP_LOG_DEBUG("Kompute OpTensorFill preEval called");
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
OpTensorFill::postEval(const vk::CommandBuffer& /*commandBuffer*/)
|
||||||
|
{
|
||||||
|
KP_LOG_DEBUG("Kompute OpTensorFill postEval called");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -215,6 +215,13 @@ Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
|
|||||||
commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion);
|
commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
Tensor::recordFill(const vk::CommandBuffer &commandBuffer,
|
||||||
|
uint32_t fill)
|
||||||
|
{
|
||||||
|
commandBuffer.fillBuffer(*this->mPrimaryBuffer, mOffset, this->memorySize(), fill);
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
|
Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
|
||||||
vk::AccessFlagBits srcAccessMask,
|
vk::AccessFlagBits srcAccessMask,
|
||||||
|
@ -21,6 +21,7 @@ target_sources(kompute PRIVATE
|
|||||||
kompute/operations/OpMemoryBarrier.hpp
|
kompute/operations/OpMemoryBarrier.hpp
|
||||||
kompute/operations/OpMult.hpp
|
kompute/operations/OpMult.hpp
|
||||||
kompute/operations/OpTensorCopy.hpp
|
kompute/operations/OpTensorCopy.hpp
|
||||||
|
kompute/operations/OpTensorFill.hpp
|
||||||
kompute/operations/OpTensorSyncDevice.hpp
|
kompute/operations/OpTensorSyncDevice.hpp
|
||||||
kompute/operations/OpTensorSyncLocal.hpp
|
kompute/operations/OpTensorSyncLocal.hpp
|
||||||
kompute/operations/OpBufferSyncDevice.hpp
|
kompute/operations/OpBufferSyncDevice.hpp
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
#include "operations/OpTensorSyncLocal.hpp"
|
#include "operations/OpTensorSyncLocal.hpp"
|
||||||
#include "operations/OpBufferSyncDevice.hpp"
|
#include "operations/OpBufferSyncDevice.hpp"
|
||||||
#include "operations/OpBufferSyncLocal.hpp"
|
#include "operations/OpBufferSyncLocal.hpp"
|
||||||
|
#include "operations/OpTensorFill.hpp"
|
||||||
|
|
||||||
// Will be build by CMake and placed inside the build directory
|
// Will be build by CMake and placed inside the build directory
|
||||||
#include "ShaderLogisticRegression.hpp"
|
#include "ShaderLogisticRegression.hpp"
|
||||||
|
@ -126,6 +126,9 @@ class Tensor
|
|||||||
void recordCopyFrom(const vk::CommandBuffer& commandBuffer,
|
void recordCopyFrom(const vk::CommandBuffer& commandBuffer,
|
||||||
std::shared_ptr<Tensor> copyFromTensor);
|
std::shared_ptr<Tensor> copyFromTensor);
|
||||||
|
|
||||||
|
/**
 * Records a fill command that writes the given 32-bit value over this
 * tensor's range of the primary buffer.
 *
 * @param commandBuffer Vulkan command buffer the command is recorded into.
 * @param fill Value used to fill the buffer range.
 */
void recordFill(const vk::CommandBuffer& commandBuffer, uint32_t fill);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Records a copy from the internal staging memory to the device memory
|
* Records a copy from the internal staging memory to the device memory
|
||||||
* using an optional barrier to wait for the operation. This function would
|
* using an optional barrier to wait for the operation. This function would
|
||||||
@ -279,6 +282,7 @@ class Tensor
|
|||||||
vk::Buffer *bufferTo,
|
vk::Buffer *bufferTo,
|
||||||
vk::DeviceSize bufferSize,
|
vk::DeviceSize bufferSize,
|
||||||
vk::BufferCopy copyRegion);
|
vk::BufferCopy copyRegion);
|
||||||
|
|
||||||
void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
|
void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
|
||||||
const vk::Buffer& buffer,
|
const vk::Buffer& buffer,
|
||||||
vk::AccessFlagBits srcAccessMask,
|
vk::AccessFlagBits srcAccessMask,
|
||||||
|
58
kompute/src/include/kompute/operations/OpTensorFill.hpp
Normal file
58
kompute/src/include/kompute/operations/OpTensorFill.hpp
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "kompute/Core.hpp"
|
||||||
|
|
||||||
|
#include "kompute/Tensor.hpp"
|
||||||
|
|
||||||
|
#include "kompute/operations/OpBase.hpp"
|
||||||
|
|
||||||
|
namespace kp {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Operation that fills the tensor
|
||||||
|
*/
|
||||||
|
class OpTensorFill : public OpBase
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
/**
|
||||||
|
* Default constructor with parameters that provides the core vulkan
|
||||||
|
* resources and the tensors that will be used in the operation.
|
||||||
|
*
|
||||||
|
* @param tensors Tensors that will be used to create in operation.
|
||||||
|
*/
|
||||||
|
OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default destructor. This class does not manage memory so it won't be
|
||||||
|
* expecting the parent to perform a release.
|
||||||
|
*/
|
||||||
|
~OpTensorFill() override;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Records the fill command for tensor.
|
||||||
|
*
|
||||||
|
* @param commandBuffer The command buffer to record the command into.
|
||||||
|
*/
|
||||||
|
void record(const vk::CommandBuffer& commandBuffer) override;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Does not perform any preEval commands.
|
||||||
|
*
|
||||||
|
* @param commandBuffer The command buffer to record the command into.
|
||||||
|
*/
|
||||||
|
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Does not perform any postEval commands.
|
||||||
|
*
|
||||||
|
* @param commandBuffer The command buffer to record the command into.
|
||||||
|
*/
|
||||||
|
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
// -------------- ALWAYS OWNED RESOURCES
|
||||||
|
std::vector<std::shared_ptr<Tensor>> mTensors;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // End namespace kp
|
@ -6495,7 +6495,8 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
if (ggml_vk_has_device() && params.n_gpu_layers > 0
|
if (ggml_vk_has_device() && params.n_gpu_layers > 0
|
||||||
&& (model->ftype == LLAMA_FTYPE_ALL_F32
|
&& (model->ftype == LLAMA_FTYPE_ALL_F32
|
||||||
|| model->ftype == LLAMA_FTYPE_MOSTLY_F16
|
|| model->ftype == LLAMA_FTYPE_MOSTLY_F16
|
||||||
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0)) {
|
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
|
||||||
|
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
|
||||||
// this allocates all Vulkan resources and memory buffers
|
// this allocates all Vulkan resources and memory buffers
|
||||||
ctx->ctx_kompute = ggml_vk_init();
|
ctx->ctx_kompute = ggml_vk_init();
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user