From 04aaae1d79482cad2564412f3b32e70298ac7789 Mon Sep 17 00:00:00 2001
From: Yann Follet <131855179+YannFollet@users.noreply.github.com>
Date: Fri, 28 Apr 2023 19:59:48 +0800
Subject: [PATCH] add avx2 for dot_q8_0_q8_0, 2x faster than scalar (#1211)

---
 ggml.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/ggml.c b/ggml.c
index 3422a9448..1fbf2955d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3626,6 +3626,24 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     }
 
     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+#elif defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (int i = 0; i < nb; ++i) {
+        // Compute combined scale for the block
+        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
+        __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
+        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+
+        // Multiply q with scale and accumulate
+        acc = _mm256_fmadd_ps( d, q, acc );
+    }
+
+    *s = hsum_float_8(acc);
 #else
     // scalar
     float sumf = 0.0;