/**
 * Copyright (c) 2023 Nomic, Inc. All rights reserved.
 *
 * This software is licensed under the terms of the Software for Open Models License (SOM),
 * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
 * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
 */

#version 450

#include "common.comp"

// A single invocation per workgroup; main() takes its tensor coordinates from gl_WorkGroupID.
layout(local_size_x = 1) in;

// Source and destination tensors, both addressed as flat arrays of 32-bit floats.
layout (binding = 0) readonly buffer tensorIn { float in_[]; };
layout (binding = 1) writeonly buffer tensorOut { float out_[]; };

// Per-dispatch parameters read by main().
layout (push_constant) uniform parameter {
    // Added to the computed element index into in_ (i.e. counted in floats, not bytes).
    uint inOff;
    // Added to the computed element index into out_ (i.e. counted in floats, not bytes).
    uint outOff;
    // Added to the dim-2 index to form the position when bit 0 of `mode` is clear.
    uint n_past;
    // Sets the per-pair angle decay pow(freq_base, -2/n_dims); in neox mode it is also the
    // block size, and n_dims/2 is the distance between the two elements of a rotated pair.
    int n_dims;
    // Bit flags: bit 0 (value 1) set -> position is the dim-2 index alone (n_past ignored);
    // bit 1 (value 2) set -> use the neox pairing layout instead of adjacent pairs.
    int mode;
    // Base of the power used to derive the per-pair angle decay.
    float freq_base;
    // Multiplier applied to the position to obtain the initial angle.
    float freq_scale;
    // Input strides for dims 0..3. main() sums index*stride and divides by 4 before indexing
    // in_, so these are presumably byte strides of a 4-byte-float tensor -- confirm with the host code.
    uint nb00;
    uint nb01;
    uint nb02;
    uint nb03;
    // Extent of dim 0: the number of elements processed per row.
    int ne0;
    // Output strides for dims 0..3, used the same way as nb00..nb03 but for out_.
    uint nb0;
    uint nb1;
    uint nb2;
    uint nb3;
} pcs;

// Rotates pairs of elements along dim 0 of one tensor row by a position-dependent angle
// (rotary position embedding) and writes the result to out_.
//
// Each workgroup handles one row, selected by gl_WorkGroupID: x -> dim 1, y -> dim 2,
// z -> dim 3. The angle starts at freq_scale * position and is multiplied by
// pow(freq_base, -2/n_dims) after every rotated pair.
//
// Two pairing layouts are supported, selected by bit 1 of pcs.mode:
//   - clear: adjacent elements (i0, i0+1) form a pair, over the whole ne0 extent;
//   - set (neox): within each n_dims-sized block, element k is paired with element k + n_dims/2.
void main() {
    const uint i3 = gl_WorkGroupID.z;
    const uint i2 = gl_WorkGroupID.y;
    const uint i1 = gl_WorkGroupID.x;

    const bool is_neox = (pcs.mode & 2) != 0;

    // Factor applied to the angle after each rotated pair.
    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);

    // Position of this row: offset by n_past unless bit 0 of mode is set.
    const uint p = ((pcs.mode & 1) == 0 ? pcs.n_past + i2 : i2);

    float theta = pcs.freq_scale * float(p);

    // Loop-invariant part of the row address. The strides are summed first and divided by 4
    // afterwards (inside the loops) so the result is identical to dividing the full sum;
    // the division converts the summed offset into a float index.
    const uint src_row = i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01;
    const uint dst_row = i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1;

    if (!is_neox) {
        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
            const float cos_theta = cos(theta);
            const float sin_theta = sin(theta);

            theta *= theta_scale;

            // Float indices of the first pair element in in_ / out_ (buffer offsets included).
            const uint src = uint((src_row + i0*pcs.nb00) / 4) + pcs.inOff;
            const uint dst_data = uint((dst_row + i0*pcs.nb0) / 4) + pcs.outOff;

            const float x0 = in_[src];
            const float x1 = in_[src+1];

            out_[dst_data] = x0*cos_theta - x1*sin_theta;
            out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
        }
    } else {
        // Computed only in this branch so the non-neox path never divides by n_dims.
        const uint n_blocks = uint(pcs.ne0/pcs.n_dims);
        const uint block_len = uint(pcs.n_dims);
        // Distance, in floats, between the two elements of a pair.
        const uint half_dims = uint(pcs.n_dims/2);

        // NOTE(review): theta is not reset between blocks, so when ne0 > n_dims later blocks
        // continue the angle progression of the previous one -- confirm this is intended.
        for (uint ib = 0; ib < n_blocks; ++ib) {
            for (uint ic = 0; ic < block_len; ic += 2) {
                const float cos_theta = cos(theta);
                const float sin_theta = sin(theta);

                theta *= theta_scale;

                // First element of the pair: the (ic/2)-th entry of block ib.
                const uint i0 = ib*block_len + ic/2;

                // Float indices of the first pair element in in_ / out_ (buffer offsets included).
                const uint src = uint((src_row + i0*pcs.nb00) / 4) + pcs.inOff;
                const uint dst_data = uint((dst_row + i0*pcs.nb0) / 4) + pcs.outOff;

                const float x0 = in_[src];
                const float x1 = in_[src+half_dims];

                out_[dst_data] = x0*cos_theta - x1*sin_theta;
                out_[dst_data+half_dims] = x0*sin_theta + x1*cos_theta;
            }
        }
    }
}