mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-01 00:39:00 +01:00
a3738b2fa7
* Fix Vulkan repeat op
* Implement Vulkan concat op
* Delete old Vulkan shader generator
* Implement Vulkan im2col op
* Implement Vulkan unary gelu_quick op
* Implement Vulkan group_norm op
* Implement Vulkan timestep_embedding op
* Implement Vulkan upscale op
* Fix Vulkan vk_context tensor extra index issue
* Fix Vulkan matmul shader parameter bug
* Properly fix Vulkan matmul shader parameter bug
* Add Vulkan ADD f16 + f32 -> f16 operator support
* Implement Vulkan tanh op
* Fix Vulkan group count too large Validation error on non-Nvidia GPUs
* Throw error when too much memory is requested
* Fix another Vulkan group count too large Validation error on non-Nvidia GPUs
* Fix matmul MMQ condition
* Implement Vulkan pad op
* Fix Vulkan crash when tensor is used multiple times in a compute graph
* Add Vulkan CONCAT f16 + f16 -> f16 op
* Add Vulkan LEAKY_RELU op
40 lines
1.5 KiB
Plaintext
40 lines
1.5 KiB
Plaintext
#extension GL_EXT_shader_16bit_storage : require
|
|
|
|
// Push-constant block, accessed through the instance name `p`.
// The order and types of the members determine the block layout, so they
// are kept exactly as declared originally; only the formatting differs.
layout (push_constant) uniform parameter
{
    // Not read by the helper functions visible in this section.
    uint ne;

    // Extents (ne0x) and per-dimension multipliers (nb0x) used by src0_idx().
    // ne03 is declared but not read by src0_idx().
    uint ne00;
    uint ne01;
    uint ne02;
    uint ne03;
    uint nb00;
    uint nb01;
    uint nb02;
    uint nb03;

    // Extents (ne1x) and per-dimension multipliers (nb1x) used by dst_idx().
    // ne13 is declared but not read by dst_idx().
    uint ne10;
    uint ne11;
    uint ne12;
    uint ne13;
    uint nb10;
    uint nb11;
    uint nb12;
    uint nb13;

    // Not read by the helper functions visible in this section.
    uint d_offset;

    // Not read by the helper functions visible in this section.
    float param1;
    float param2;
} p;
|
|
|
|
// Workgroup size: 512 invocations along x, a single invocation along y and z.
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

// Binding 0: read-only storage buffer. The element type A_TYPE is not
// defined in this section (presumably supplied as a preprocessor define).
layout (binding = 0) readonly buffer A
{
    A_TYPE data_a[];
};

// Binding 1: write-only storage buffer. The element type D_TYPE is not
// defined in this section (presumably supplied as a preprocessor define).
layout (binding = 1) writeonly buffer D
{
    D_TYPE data_d[];
};
|
|
|
|
// Collapses the 3D global invocation ID into a single linear index.
// x contributes directly, y is scaled by 512 and z by 262144 (= 512 * 512).
uint get_idx() {
    const uvec3 gid = gl_GlobalInvocationID;
    return gid.x + gid.y * 512 + gid.z * 262144;
}
|
|
|
|
// Maps a linear index to an offset built from the nb00..nb03 multipliers.
//
// idx is split into four coordinates (i00 varies fastest, i03 slowest)
// using the extents p.ne00, p.ne01 and p.ne02; each coordinate is then
// multiplied by its matching p.nb0x value and the products are summed.
// p.ne03 is not consulted, so i03 is not range-checked here.
// NOTE(review): divides by the extents, so this presumably expects
// ne00..ne02 to be non-zero - confirm against the host code.
uint src0_idx(uint idx) {
    // Number of linear indices spanned by one step of i01, i02 and i03.
    const uint count0   = p.ne00;
    const uint count01  = p.ne01*p.ne00;
    const uint count012 = p.ne02*p.ne01*p.ne00;

    // Peel off the coordinates from slowest to fastest, keeping the remainder.
    uint rest = idx;

    const uint i03 = rest / count012;
    rest -= i03 * count012;

    const uint i02 = rest / count01;
    rest -= i02 * count01;

    const uint i01 = rest / count0;
    const uint i00 = rest - i01 * count0;

    return i00*p.nb00 + i01*p.nb01 + i02*p.nb02 + i03*p.nb03;
}
|
|
|
|
// Maps a linear index to an offset built from the nb10..nb13 multipliers.
//
// idx is split into four coordinates (i10 varies fastest, i13 slowest)
// using the extents p.ne10, p.ne11 and p.ne12; each coordinate is then
// multiplied by its matching p.nb1x value and the products are summed.
// p.ne13 is not consulted, so i13 is not range-checked here.
// NOTE(review): divides by the extents, so this presumably expects
// ne10..ne12 to be non-zero - confirm against the host code.
uint dst_idx(uint idx) {
    // Number of linear indices spanned by one step of i11, i12 and i13.
    const uint count0   = p.ne10;
    const uint count01  = p.ne11*p.ne10;
    const uint count012 = p.ne12*p.ne11*p.ne10;

    // Peel off the coordinates from slowest to fastest, keeping the remainder.
    uint rest = idx;

    const uint i13 = rest / count012;
    rest -= i13 * count012;

    const uint i12 = rest / count01;
    rest -= i12 * count01;

    const uint i11 = rest / count0;
    const uint i10 = rest - i11 * count0;

    return i10*p.nb10 + i11*p.nb11 + i12*p.nb12 + i13*p.nb13;
}
|