diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index c0c9edd56..17154ec1a 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -4596,14 +4596,12 @@ static __global__ void rope_neox(
     const int i = row*ncols + col/2;
     const int i2 = row/p_delta_rows;
 
-    // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
-    const float cur_rot = -float(col)/ncols;
-
     const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*powf(freq_base, cur_rot);
+    const float theta_base = p*powf(freq_base, -float(col)/ncols);
 
+    // rotation amount is `ib * ncols + col`, but ib is assumed to be zero
     float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
+    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
     const float x0 = x[i + 0];
     const float x1 = x[i + ncols/2];
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 5d1357cd7..e8032dd5b 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1277,10 +1277,9 @@ kernel void kernel_rope(
         for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
             for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
 
-                // simplified from `(ib * n_dims + ic) * inv_ndims`
-                const float cur_rot = inv_ndims*ic - ib;
+                const int64_t cur_rot = ib * n_dims + ic;
 
-                const float theta = theta_0 * pow(freq_base, cur_rot);
+                const float theta = theta_0 * pow(freq_base, inv_ndims*cur_rot);
                 float cos_theta, sin_theta;
                 rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
diff --git a/ggml.c b/ggml.c
index 3202a517b..012b326ba 100644
--- a/ggml.c
+++ b/ggml.c
@@ -11084,8 +11084,7 @@ static void ggml_compute_forward_rope_f32(
                     theta_base *= freq_scale;
                     for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                         for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            // simplified from `(ib * n_dims + ic) * inv_ndims`
-                            float cur_rot = inv_ndims * ic - ib;
+                            int64_t cur_rot = ib * n_dims + ic;
 
                             float cos_theta, sin_theta;
                             rope_yarn(
@@ -11237,8 +11236,7 @@ static void ggml_compute_forward_rope_f16(
                     theta_base *= freq_scale;
                     for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                         for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            // simplified from `(ib * n_dims + ic) * inv_ndims`
-                            float cur_rot = inv_ndims * ic - ib;
+                            int64_t cur_rot = ib * n_dims + ic;
 
                             float cos_theta, sin_theta;
                             rope_yarn(