From 69071d3b6b95952ef94b04f5f6f0fab730605fc7 Mon Sep 17 00:00:00 2001 From: Matvey Soloviev Date: Tue, 21 Mar 2023 22:55:35 +0100 Subject: [PATCH] Squeeze out about 5% more performance in Q4_1 inference --- ggml.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/ggml.c b/ggml.c index 0e4b1466c..8f405468d 100644 --- a/ggml.c +++ b/ggml.c @@ -1702,7 +1702,7 @@ inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); // Accumulator for constant offsets - float acc_offset = 0.0f; + __m128 acc_offset = _mm_setzero_ps(); //0.0f; // Main loop for (int i = 0; i < nb; ++i) { @@ -1756,14 +1756,17 @@ inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void __m256i sumsi = _mm256_or_si256( xsumi, _mm256_slli_si256( ysumi, 4 ) ); __m256 sums = _mm256_cvtepi32_ps( sumsi ); + // Apply the scales, and accumulate + // acc += d0*m1*x + d1*m0*y + acc = _mm256_fmadd_ps( cross_scales, sums, acc ); + // Convert int32_t to float __m256 p = _mm256_cvtepi32_ps( i32 ); - // Apply the scale, and accumulate - // acc += d0*d1*x*y + d0*m1*x + d1*m0*y + // acc += d0*d1*x*y acc = _mm256_fmadd_ps( scale_01, p, acc ); - acc = _mm256_fmadd_ps( cross_scales, sums, acc ); - // acc_offset += m0*m1 (for each entry in the block) - acc_offset += (*m0)*(*m1); + + // acc_offset += m0*m1 (avoid reloading from RAM) + acc_offset = _mm_fmadd_ss( _mm256_castps256_ps128( m0v ), _mm256_castps256_ps128( m1v ), acc_offset ); } // Return horizontal sum of the acc vector @@ -1772,7 +1775,7 @@ inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); - sumf = _mm_cvtss_f32( res ) + acc_offset * QK; + sumf = _mm_cvtss_f32( res ) + _mm_cvtss_f32( acc_offset )* QK; #else #error "not implemented for QK" #endif