mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-15 14:50:51 +01:00
vulkan : optimize workgroup sizes
This commit is contained in:
parent
84f7fc4553
commit
39abedd1d7
@ -847,9 +847,9 @@ void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
|
|||||||
};
|
};
|
||||||
|
|
||||||
std::shared_ptr<kp::Algorithm> s_algo = nullptr;
|
std::shared_ptr<kp::Algorithm> s_algo = nullptr;
|
||||||
if (!komputeManager()->hasAlgorithm(__func__))
|
if (!komputeManager()->hasAlgorithm(__func__)) {
|
||||||
s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
|
s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
|
||||||
else {
|
} else {
|
||||||
s_algo = komputeManager()->getAlgorithm(__func__);
|
s_algo = komputeManager()->getAlgorithm(__func__);
|
||||||
s_algo->setTensors({in, out});
|
s_algo->setTensors({in, out});
|
||||||
s_algo->setWorkgroup({(uint32_t)nrows});
|
s_algo->setWorkgroup({(uint32_t)nrows});
|
||||||
|
@ -10,13 +10,12 @@
|
|||||||
|
|
||||||
#include "common.comp"
|
#include "common.comp"
|
||||||
|
|
||||||
#define nth 32
|
|
||||||
#define IN_TYPE float16_t
|
#define IN_TYPE float16_t
|
||||||
#define IN_TYPE_SIZE 2
|
#define IN_TYPE_SIZE 2
|
||||||
#define OUT_TYPE float16_t
|
#define OUT_TYPE float16_t
|
||||||
#define OUT_TYPE_SIZE 2
|
#define OUT_TYPE_SIZE 2
|
||||||
|
|
||||||
layout(local_size_x = nth) in;
|
layout(local_size_x = 1024) in;
|
||||||
|
|
||||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||||
@ -54,7 +53,7 @@ void main() {
|
|||||||
|
|
||||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||||
|
|
||||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||||
}
|
}
|
||||||
|
@ -10,13 +10,12 @@
|
|||||||
|
|
||||||
#include "common.comp"
|
#include "common.comp"
|
||||||
|
|
||||||
#define nth 32
|
|
||||||
#define IN_TYPE float16_t
|
#define IN_TYPE float16_t
|
||||||
#define IN_TYPE_SIZE 2
|
#define IN_TYPE_SIZE 2
|
||||||
#define OUT_TYPE float
|
#define OUT_TYPE float
|
||||||
#define OUT_TYPE_SIZE 4
|
#define OUT_TYPE_SIZE 4
|
||||||
|
|
||||||
layout(local_size_x = nth) in;
|
layout(local_size_x = 1024) in;
|
||||||
|
|
||||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||||
@ -54,7 +53,7 @@ void main() {
|
|||||||
|
|
||||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||||
|
|
||||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||||
}
|
}
|
||||||
|
@ -10,13 +10,12 @@
|
|||||||
|
|
||||||
#include "common.comp"
|
#include "common.comp"
|
||||||
|
|
||||||
#define nth 32
|
|
||||||
#define IN_TYPE float
|
#define IN_TYPE float
|
||||||
#define IN_TYPE_SIZE 4
|
#define IN_TYPE_SIZE 4
|
||||||
#define OUT_TYPE float16_t
|
#define OUT_TYPE float16_t
|
||||||
#define OUT_TYPE_SIZE 2
|
#define OUT_TYPE_SIZE 2
|
||||||
|
|
||||||
layout(local_size_x = nth) in;
|
layout(local_size_x = 1024) in;
|
||||||
|
|
||||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||||
@ -54,7 +53,7 @@ void main() {
|
|||||||
|
|
||||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||||
|
|
||||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||||
}
|
}
|
||||||
|
@ -2,13 +2,12 @@
|
|||||||
|
|
||||||
#include "common.comp"
|
#include "common.comp"
|
||||||
|
|
||||||
#define nth 32
|
|
||||||
#define IN_TYPE float
|
#define IN_TYPE float
|
||||||
#define IN_TYPE_SIZE 4
|
#define IN_TYPE_SIZE 4
|
||||||
#define OUT_TYPE float
|
#define OUT_TYPE float
|
||||||
#define OUT_TYPE_SIZE 4
|
#define OUT_TYPE_SIZE 4
|
||||||
|
|
||||||
layout(local_size_x = nth) in;
|
layout(local_size_x = 1024) in;
|
||||||
|
|
||||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||||
@ -46,7 +45,7 @@ void main() {
|
|||||||
|
|
||||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||||
|
|
||||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||||
}
|
}
|
||||||
|
@ -10,9 +10,7 @@
|
|||||||
|
|
||||||
#include "common.comp"
|
#include "common.comp"
|
||||||
|
|
||||||
#define nth 256
|
layout(local_size_x = 256) in;
|
||||||
|
|
||||||
layout(local_size_x = nth) in;
|
|
||||||
|
|
||||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||||
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
|
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
|
||||||
@ -25,21 +23,21 @@ layout(push_constant) uniform PushConstants {
|
|||||||
float eps;
|
float eps;
|
||||||
} pcs;
|
} pcs;
|
||||||
|
|
||||||
shared float sum[nth];
|
shared float sum[gl_WorkGroupSize.x];
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
|
const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
|
||||||
// MEAN
|
// MEAN
|
||||||
// parallel sum
|
// parallel sum
|
||||||
sum[gl_LocalInvocationID.x] = 0.0;
|
sum[gl_LocalInvocationID.x] = 0.0;
|
||||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||||
sum[gl_LocalInvocationID.x] += in_[x+i00];
|
sum[gl_LocalInvocationID.x] += in_[x+i00];
|
||||||
}
|
}
|
||||||
|
|
||||||
// reduce
|
// reduce
|
||||||
barrier();
|
barrier();
|
||||||
memoryBarrierShared();
|
memoryBarrierShared();
|
||||||
[[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
|
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||||
if (gl_LocalInvocationID.x < i) {
|
if (gl_LocalInvocationID.x < i) {
|
||||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||||
}
|
}
|
||||||
@ -57,21 +55,21 @@ void main() {
|
|||||||
|
|
||||||
// recenter
|
// recenter
|
||||||
const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
|
const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
|
||||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||||
out_[y+i00] = in_[x+i00] - mean;
|
out_[y+i00] = in_[x+i00] - mean;
|
||||||
}
|
}
|
||||||
|
|
||||||
// VARIANCE
|
// VARIANCE
|
||||||
// parallel sum
|
// parallel sum
|
||||||
sum[gl_LocalInvocationID.x] = 0.0;
|
sum[gl_LocalInvocationID.x] = 0.0;
|
||||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||||
sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
|
sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
|
||||||
}
|
}
|
||||||
|
|
||||||
// reduce
|
// reduce
|
||||||
barrier();
|
barrier();
|
||||||
memoryBarrierShared();
|
memoryBarrierShared();
|
||||||
[[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
|
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||||
if (gl_LocalInvocationID.x < i) {
|
if (gl_LocalInvocationID.x < i) {
|
||||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||||
}
|
}
|
||||||
@ -88,7 +86,7 @@ void main() {
|
|||||||
const float variance = sum[0];
|
const float variance = sum[0];
|
||||||
|
|
||||||
const float scale = 1.0f/sqrt(variance + pcs.eps);
|
const float scale = 1.0f/sqrt(variance + pcs.eps);
|
||||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||||
out_[y+i00] *= scale;
|
out_[y+i00] *= scale;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -10,9 +10,7 @@
|
|||||||
|
|
||||||
#include "common.comp"
|
#include "common.comp"
|
||||||
|
|
||||||
#define nth 512
|
layout(local_size_x = 512) in;
|
||||||
|
|
||||||
layout(local_size_x = nth) in;
|
|
||||||
|
|
||||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||||
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
|
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
|
||||||
@ -25,21 +23,21 @@ layout(push_constant) uniform PushConstants {
|
|||||||
float eps;
|
float eps;
|
||||||
} pcs;
|
} pcs;
|
||||||
|
|
||||||
shared float sum[nth];
|
shared float sum[gl_WorkGroupSize.x];
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
|
const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
|
||||||
|
|
||||||
// parallel sum
|
// parallel sum
|
||||||
sum[gl_LocalInvocationID.x] = 0.0;
|
sum[gl_LocalInvocationID.x] = 0.0;
|
||||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||||
sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
|
sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
|
||||||
}
|
}
|
||||||
|
|
||||||
// reduce
|
// reduce
|
||||||
barrier();
|
barrier();
|
||||||
memoryBarrierShared();
|
memoryBarrierShared();
|
||||||
[[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
|
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||||
if (gl_LocalInvocationID.x < i) {
|
if (gl_LocalInvocationID.x < i) {
|
||||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||||
}
|
}
|
||||||
@ -57,7 +55,7 @@ void main() {
|
|||||||
const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
|
const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
|
||||||
|
|
||||||
const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
|
const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
|
||||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
|
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||||
out_[y+i00] = in_[x+i00] * scale;
|
out_[y+i00] = in_[x+i00] * scale;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
|
|
||||||
#include "common.comp"
|
#include "common.comp"
|
||||||
|
|
||||||
|
// TODO: use a local size of 32 or more (Metal uses 1024)
|
||||||
layout(local_size_x = 1) in;
|
layout(local_size_x = 1) in;
|
||||||
|
|
||||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||||
|
Loading…
Reference in New Issue
Block a user