Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-27 22:59:24 +01:00)
cuda : avoid warp_reduce for smax

commit b150abe83e
parent b68a112204
@@ -6621,7 +6621,6 @@ static __global__ void flash_attn_ext_f16(
                 M[j] = __hmax(M[j], s);
             }
 
-            smax = warp_reduce_max(smax);
             M[j] = warp_reduce_max(M[j]);
 
             const half ms = __hisinf(m) == -1 ? __float2half(0.0f) : hexp(m - M[j]);
@@ -6649,6 +6648,8 @@ static __global__ void flash_attn_ext_f16(
             }
         }
 
+        smax = warp_reduce_max(smax);
+
         // skip -INF blocks
         if (__hisinf(smax) == -1) {
             continue;
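For context: the change drops the per-iteration reduction of smax inside the inner j loop and instead reduces it once, right before the -INF block-skip check, where the warp-wide value is actually needed. Below is a minimal sketch of the shape such a warp_reduce_max helper typically has, assuming the common butterfly XOR-shuffle pattern; the helper actually defined in llama.cpp may differ in details (type overloads, arch guards).

#include <cuda_fp16.h>

// Sketch only: assumed shape of warp_reduce_max, not the verbatim
// llama.cpp helper. Butterfly reduction: after log2(32) = 5 XOR-shuffle
// steps, every lane of the warp holds the maximum over all 32 lanes.
// Note: the half-precision __hmax device intrinsic requires
// compute capability >= 8.0.
static __device__ __forceinline__ half warp_reduce_max(half x) {
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        // exchange with the lane whose id differs in one bit; keep the max
        x = __hmax(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
    }
    return x;
}

Because every lane ends up with the same reduced value, calling it once after the loop is equivalent to (and cheaper than) reducing smax on every iteration, which is what the two hunks above accomplish.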