llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp

#version 450

#include "types.comp"

#extension GL_EXT_shader_16bit_storage : require

// Push constants for the 2D pooling shader. The member order and types below
// define the push-constant layout; the host code that fills it is not visible
// from this file, so any reordering here must be mirrored on the host side.
layout(push_constant) uniform parameter {
    // Input plane: IW is the row length (column bound), IH the number of rows.
    uint IW;
    uint IH;
    // Output plane: OW is the row length, OH the number of rows.
    uint OW;
    uint OH;
    // Not read by main() in this shader.
    uint OC;
    // Total number of output elements; invocations at or past this index exit early.
    uint pelements;
    // Pooling mode, compared against OP_POOL_MAX / OP_POOL_AVG.
    uint op;
    // Window extent: main() applies k0 along rows and k1 along columns.
    int k0;
    int k1;
    // Window stride: s0 along rows, s1 along columns.
    int s0;
    int s1;
    // Leading padding subtracted from the window start: p0 for rows, p1 for columns.
    int p0;
    int p1;
} p;

// Number of invocations per workgroup along x (used by the local_size_x layout below).
#define BLOCK_SIZE 512
// Largest finite 32-bit float; main() negates it to seed the running maximum.
#define FLT_MAX 3.402823466e+38F
// Values of the push constant `op` that main() recognises; any other value makes main() return without writing.
// NOTE(review): presumably these mirror a host-side pooling-op enum; confirm against the caller.
#define OP_POOL_MAX 0u
#define OP_POOL_AVG 1u

// One-dimensional workgroup: BLOCK_SIZE invocations along x, 1 along y and z.
layout (local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

// Binding 0 is the read-only input array, binding 1 the write-only output array.
// A_TYPE and D_TYPE are not defined in this file; presumably they come from types.comp or compile-time defines (TODO confirm).
layout(binding = 0) readonly buffer X {A_TYPE data_a[];};
layout(binding = 1) writeonly buffer D {D_TYPE data_d[];};

// Computes one output element of a 2D max/average pooling per invocation.
//
// The flat invocation index is split into (plane nc, output row cur_oh, output
// column cur_ow). The pooling window for that output position is clipped to the
// input plane and reduced:
//   - OP_POOL_AVG: sum of the in-bounds taps, each scaled by 1 / (k0 * k1), so
//     taps that fall outside the input contribute zero to the average.
//   - OP_POOL_MAX: maximum of the in-bounds taps, starting from -FLT_MAX.
// Any other op value returns without writing the output element.
void main() {
    const uint idx = gl_GlobalInvocationID.x;
    if (idx >= p.pelements) {
        return;
    }

    const uint O_HW = p.OW * p.OH;

    const uint nc = idx / O_HW;
    const uint cur_oh = (idx % O_HW) / p.OW;
    const uint cur_ow = (idx % O_HW) % p.OW;

    // Keep the window bounds signed. With an unsigned end bound, a window that
    // lies entirely in the leading padding (start + k <= 0) would wrap to a huge
    // value, get clamped to IH/IW, and make the loops scan the whole axis instead
    // of an empty range.
    const int start_h = int(cur_oh) * p.s0 - p.p0;
    const int bh = max(start_h, 0);
    const int eh = min(start_h + p.k0, int(p.IH));

    const int start_w = int(cur_ow) * p.s1 - p.p1;
    const int bw = max(start_w, 0);
    const int ew = min(start_w + p.k1, int(p.IW));

    const float scale = 1.0 / float(p.k0 * p.k1);
    float res;

    if (p.op == OP_POOL_AVG) {
        res = 0.0;
    } else if (p.op == OP_POOL_MAX) {
        res = -FLT_MAX;
    } else {
        return;
    }

    // Offset of the first element of input plane nc.
    const uint plane_offset = nc * p.IH * p.IW;

    // i and j are non-negative here (bh, bw >= 0), so the uint casts are exact.
    for (int i = bh; i < eh; i++) {
        const uint row_offset = plane_offset + uint(i) * p.IW;

        for (int j = bw; j < ew; j++) {
            // Accumulate in 32-bit float regardless of the storage types.
            const float cur = float(data_a[row_offset + uint(j)]);

            if (p.op == OP_POOL_AVG) {
                res += cur * scale;
            } else if (p.op == OP_POOL_MAX) {
                res = max(res, cur);
            }
        }
    }

    data_d[nc * O_HW + cur_oh * p.OW + cur_ow] = D_TYPE(res);
}
ggml: Add POOL2D OP for GPU acceleration to the Vulkan backend in the MobileVLM model. (#9763) * ggml: Add POOL2D OP for GPU ACC to the Vulkan. - The MobileVLM model now supports inference acceleration through GPU by utilizing the Vulkan backend. - A GGML_OP_POOL_2D shader has been added. (Pooling) - The encoding performance of the CLIP model improved from 2.8s on the CPU to 0.7s on the GPU. Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com> * [fix] Correct the incorrect order of the parameters. fix casting to int. Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com> --------- Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com> 2024-10-29 09:52:56 +01:00			`#version 450`

			`#include "types.comp"`

			`#extension GL_EXT_shader_16bit_storage : require`

			`layout(push_constant) uniform parameter {`
			`uint IW; uint IH;`
			`uint OW; uint OH;`
			`uint OC;`
			`uint pelements;`
			`uint op;`
			`int k0; int k1;`
			`int s0; int s1;`
			`int p0; int p1;`
			`} p;`

			`#define BLOCK_SIZE 512`
			`#define FLT_MAX 3.402823466e+38F`
			`#define OP_POOL_MAX 0u`
			`#define OP_POOL_AVG 1u`

			`layout (local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;`

			`layout(binding = 0) readonly buffer X {A_TYPE data_a[];};`
			`layout(binding = 1) writeonly buffer D {D_TYPE data_d[];};`

			`void main() {`
			`const uint idx = gl_GlobalInvocationID.x;`
			`if (idx >= p.pelements) {`
			`return;`
			`}`

			`const uint O_HW = p.OW * p.OH;`

			`const uint nc = idx / O_HW;`
			`const uint cur_oh = (idx % O_HW) / p.OW;`
			`const uint cur_ow = (idx % O_HW) % p.OW;`

			`const int start_h = int(cur_oh) * p.s0 - p.p0;`
			`const uint bh = max(start_h, 0);`
			`const uint eh = min(start_h + p.k0, p.IH);`

			`const int start_w = int(cur_ow) * p.s1 - p.p1;`
			`const uint bw = max(start_w, 0);`
			`const uint ew = min(start_w + p.k1, p.IW);`

			`const float scale = 1.0 / float(p.k0 * p.k1);`
			`float res;`

			`if (p.op == OP_POOL_AVG) {`
			`res = 0.0;`
			`} else if (p.op == OP_POOL_MAX) {`
			`res = -FLT_MAX;`
			`} else {`
			`return;`
			`}`

			`#pragma unroll`
			`for (uint i = bh; i < eh; i++) {`
			`#pragma unroll`
			`for (uint j = bw; j < ew; j++) {`
			`const float cur = D_TYPE(data_a[nc * p.IH * p.IW + i * p.IW + j]);`

			`if (p.op == OP_POOL_AVG) {`
			`res += cur * scale;`
			`} else if (p.op == OP_POOL_MAX) {`
			`res = max(res, cur);`
			`}`
			`}`
			`}`

			`data_d[nc * O_HW + cur_oh * p.OW + cur_ow] = res;`
			`}`