llama.cpp/kompute-shaders/op_mul_mv_q_n.comp

// Entry point of the quantized matrix * vector kernel.
//
// Work decomposition, as established by the indexing below:
//   - gl_WorkGroupID.x / gl_SubgroupID select a tile of N_ROWS consecutive
//     rows of inA, starting at `first_row`.
//   - gl_WorkGroupID.y (r1) selects the inB row / output row.
//   - gl_WorkGroupID.z (im) is the flattened batch index, split into
//     i12 = im % ne12 and i13 = im / ne12.
//   - Only the first 32 invocations of each subgroup do any work. They are
//     paired up: invocation pair `ix` handles one quantized block per step,
//     each half starting at offset `il` inside the block, so 16 blocks are
//     consumed per iteration of the outer loop.
//
// The per-block dot product is delegated to block_q_n_dot_y(), which is
// provided by the shader that includes this file (its exact contract is not
// visible here).
void main() {
    // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
    if (gl_SubgroupInvocationID > 31)
        return;

    // Number of quantized blocks spanning ne00 (the stride between rows of inA,
    // in blocks).
    const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);

    const uint r0 = gl_WorkGroupID.x;
    const uint r1 = gl_WorkGroupID.y;
    const uint im = gl_WorkGroupID.z;

    // First inA row handled by this subgroup.
    const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;

    const uint i12 = im%pcs.ne12;
    const uint i13 = im/pcs.ne12;

    // Block offset of `first_row` inside inA; r2/r3 divide the batch indices
    // so that several batches of inB can map onto the same inA slice.
    const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);

    const uint x = offset0; // Based from inA without base offset
    const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB

    // Per-row partial sums for this invocation. Zeroed in a loop rather than
    // with a fixed-length initializer list so the shader does not silently
    // depend on N_ROWS being exactly 4.
    float sumf[N_ROWS];
    for (int row = 0; row < N_ROWS; ++row) {
        sumf[row] = 0.0f;
    }

    const uint ix = gl_SubgroupInvocationID/2;
    const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);

    uint yb = y + ix * BLOCKS_IN_QUANT + il;

    //debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
    //    gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
    //    gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);

    for (uint ib = ix; ib < nb; ib += 16) {
        for (int row = 0; row < N_ROWS; row++) {
            // The tile may overhang the matrix (the store below is guarded for
            // the same reason). Skip rows past ne01 so no block outside inA is
            // read. The condition is uniform across the subgroup, so it adds
            // no divergence.
            if (first_row + row < pcs.ne01) {
                const uint block_index = x + ib + row * nb;
                sumf[row] += block_q_n_dot_y(block_index, yb, il);
            }
        }

        // Advance to the inB values matching the next group of 16 blocks.
        yb += BLOCKS_IN_QUANT * 16;
    }

    for (int row = 0; row < N_ROWS; ++row) {
        // Reduce across the active invocations; this is kept outside any
        // condition so every active invocation takes part in the reduction.
        const float tot = subgroupAdd(sumf[row]);
        // A single elected invocation writes the result for rows that exist.
        if (first_row + row < pcs.ne01 && subgroupElect()) {
            out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
        }
    }
}
Nomic Vulkan backend (#4456) Signed-off-by: Jared Van Bortel <jared@nomic.ai> Co-authored-by: niansa <anton-sa@web.de> Co-authored-by: Adam Treat <treat.adam@gmail.com> Co-authored-by: Aaron Miller <apage43@ninjawhale.com> Co-authored-by: ToKiNoBug <tokinobug@163.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: slaren <slarengh@gmail.com> 2024-01-29 21:50:50 +01:00			`void main() {`
			`// NB: hack to make compatible with AMD GPUs that have a subgroup size of 64`
			`if (gl_SubgroupInvocationID > 31)`
			`return;`

			`const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);`

			`const uint r0 = gl_WorkGroupID.x;`
			`const uint r1 = gl_WorkGroupID.y;`
			`const uint im = gl_WorkGroupID.z;`

			`const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;`

			`const uint i12 = im%pcs.ne12;`
			`const uint i13 = im/pcs.ne12;`

			`const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);`

			`const uint x = offset0; // Based from inA without base offset`
			`const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB`

			`float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};`

			`const uint ix = gl_SubgroupInvocationID/2;`
			`const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);`

			`uint yb = y + ix * BLOCKS_IN_QUANT + il;`

			`//debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",`
			`// gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,`
			`// gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);`

			`for (uint ib = ix; ib < nb; ib += 16) {`
			`for (int row = 0; row < N_ROWS; row++) {`
			`const uint block_index = x + ib + row * nb;`
			`sumf[row] += block_q_n_dot_y(block_index, yb, il);`
			`}`

			`yb += BLOCKS_IN_QUANT * 16;`
			`}`

			`for (int row = 0; row < N_ROWS; ++row) {`
			`const float tot = subgroupAdd(sumf[row]);`
			`if (first_row + row < pcs.ne01 && subgroupElect()) {`
			`out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;`
			`}`
			`}`
			`}`