ggml : remove assert for AArch64 GEMV and GEMM Q4 kernels (#9217)
* ggml : remove assert for AArch64 GEMV and GEMM Q4 kernels
* added fallback mechanism when the offline re-quantized model is not optimized for the underlying target
* fix for build errors
* remove prints from the low-level code
* Rebase to the latest upstream
parent afbbfaa537
commit 1e43630218
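The diff below replaces the compile-time GGML_ASSERT guards in the AArch64 GEMV/GEMM Q4 kernels with runtime CPU-feature checks: each optimized path is attempted only when the running CPU actually supports it, returns early on success, and otherwise execution falls through to the generic scalar loop that previously sat behind an #else. The following is a minimal sketch of that dispatch pattern, not ggml code; cpu_has_fast_path, gemv_fast_path and gemv_scalar are hypothetical names standing in for probes like ggml_cpu_has_neon()/ggml_cpu_has_sve() and the real kernels.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for runtime probes such as ggml_cpu_has_neon()/ggml_cpu_has_sve(). */
static bool cpu_has_fast_path(void) {
    return false; /* pretend the offline re-quantized layout does not match this CPU */
}

/* Reference implementation: always correct, used as the fallback. */
static void gemv_scalar(int rows, int cols, const float *W, const float *x, float *y) {
    for (int r = 0; r < rows; r++) {
        float acc = 0.0f;
        for (int c = 0; c < cols; c++) {
            acc += W[r * cols + c] * x[c];
        }
        y[r] = acc;
    }
}

/* The optimized kernel (NEON/SVE intrinsics or inline asm) would live here. */
static void gemv_fast_path(int rows, int cols, const float *W, const float *x, float *y) {
    gemv_scalar(rows, cols, W, x, y); /* placeholder body for the sketch */
}

static void gemv(int rows, int cols, const float *W, const float *x, float *y) {
    if (cpu_has_fast_path()) {
        gemv_fast_path(rows, cols, W, x, y);
        return;                           /* early return, as in the `return;` lines added below */
    }
    gemv_scalar(rows, cols, W, x, y);     /* fall back instead of asserting */
}

int main(void) {
    const float W[4] = { 1, 2, 3, 4 };    /* 2x2 row-major matrix */
    const float x[2] = { 1, 1 };
    float y[2];
    gemv(2, 2, W, x, y);
    printf("%.1f %.1f\n", y[0], y[1]);    /* prints 3.0 7.0 */
    return 0;
}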
@@ -1,4 +1,7 @@
-// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
+// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-License-Identifier: MIT
+//
+
 #define GGML_COMMON_IMPL_C
 #include "ggml-common.h"
 
@@ -595,6 +598,15 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_
     return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
 }
 
+// Return the number of byte lanes in the SVE vector if SVE is supported; otherwise, returns 0 if SVE is not supported.
+static int sve_lane_count(void) {
+#if defined(__ARM_FEATURE_SVE)
+    return ggml_sve_cnt_b;
+#else
+    return 0;
+#endif
+}
+
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
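The new sve_lane_count() helper lets the 8x8 kernels compare the machine's actual SVE vector width against the block size the weights were packed for (QK8_0 bytes, i.e. a 256-bit vector) before taking the SVE path. Below is a standalone sketch of how such a guard behaves, under the assumptions that QK8_0 is 32 and that svcntb() stands in for ggml's cached ggml_sve_cnt_b; cpu_has_sve is a hypothetical stand-in for ggml_cpu_has_sve().

#include <stdbool.h>
#include <stdio.h>

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#endif

#define QK8_0 32  /* block size assumed from ggml-common.h; 32 byte lanes == 256-bit SVE */

/* Stand-in for ggml_cpu_has_sve(); a real build would query hwcaps at runtime. */
static bool cpu_has_sve(void) {
#if defined(__ARM_FEATURE_SVE)
    return true;
#else
    return false;
#endif
}

/* Same shape as the helper added in the hunk above, with svcntb() standing in
 * for the cached ggml_sve_cnt_b value. */
static int sve_lane_count(void) {
#if defined(__ARM_FEATURE_SVE)
    return (int) svcntb();  /* byte lanes of the SVE vector at run time */
#else
    return 0;               /* non-SVE build: the guard below is always false */
#endif
}

int main(void) {
    if (cpu_has_sve() && sve_lane_count() == QK8_0) {
        puts("256-bit SVE: take the Q4_0_8_8 fast path");
    } else {
        puts("fall back to the NEON or scalar path");
    }
    return 0;
}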
@@ -614,16 +626,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
-                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
-    }
-#endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
-                "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    if (ggml_cpu_has_neon()) {
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -680,7 +684,9 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
         : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
         : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22"
     );
-#else
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
     float sumf[4];
     int sumi;
 
@@ -704,7 +710,6 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
         }
         for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
     }
-#endif
 }
 
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -726,13 +731,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
-                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
-    }
-#endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -794,11 +794,9 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
         : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
         : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
     );
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
-                "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
-                "performance");
-#else
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     float sumf[4];
     int sumi;
 
@@ -822,7 +820,6 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
         }
         for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
     }
-#endif
 }
 
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -844,8 +841,9 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (ggml_sve_cnt_b == QK8_0) {
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE)
+    if (ggml_cpu_has_sve() && sve_lane_count() == QK8_0) {
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -910,24 +908,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     );
         return;
     }
-    else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
-                    "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
-                    "performance");
-    }
-    else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
-                    "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
-                    "quantization format for optimal performance");
-    }
-#endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    GGML_ASSERT(ggml_cpu_has_sve() &&
-                "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
-                "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
-                "performance");
+#endif // #if defined(__ARM_FEATURE_SVE)
 #elif defined(__AVX2__)
     // Lookup table to convert signed nibbles to signed bytes
     __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
@@ -1018,7 +999,9 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
             _mm256_storeu_ps(s + (y * nr + x * 8), acc_row);
         }
     }
-#else
+    return;
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+    {
     float sumf[8];
     int sumi;
 
@@ -1042,7 +1025,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         }
         for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
     }
-#endif
+    }
 }
 
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -1065,16 +1048,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
-                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
-    }
-#endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
-                "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    if (ggml_cpu_has_neon()) {
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -1530,7 +1505,10 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
         : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
     );
-#else
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
     float sumf[4][4];
     int sumi;
 
@@ -1563,7 +1541,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
                 }
             }
         }
-#endif
+    }
 }
 
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -1586,13 +1564,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
-                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
-    }
-#endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -1988,11 +1961,9 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
         : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
     );
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
-                "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
-                "performance");
-#else
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     float sumf[4][4];
     int sumi;
 
@@ -2025,7 +1996,6 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                 }
             }
         }
-#endif
 }
 
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -2048,8 +2018,9 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (ggml_sve_cnt_b == QK8_0) {
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && sve_lane_count() == QK8_0) {
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -2459,25 +2430,9 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     );
         return;
     }
-    else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
-                    "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
-                    "performance");
-    }
-    else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
-                    "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
-                    "quantization format for optimal performance");
-    }
-#endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    GGML_ASSERT(ggml_cpu_has_sve() &&
-                "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
-                "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
-                "performance");
+#endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
 #elif defined(__AVX2__) || defined(__AVX512F__)
+    {
     const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
     const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy;
     int64_t b_nb = n / QK4_0;
@@ -2492,7 +2447,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
     int64_t xstart = 0;
     int anr = nr - nr%16; // Used to align nr with boundary of 16
 #ifdef __AVX512F__
     int anc = nc - nc%16; // Used to align nc with boundary of 16
     // Mask to mask out nibbles from packed bytes expanded to 512 bit length
     const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
@@ -2891,7 +2846,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         xstart = anc/8;
         y = 0;
     }
 #endif // __AVX512F__
 
     // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
 
@@ -3225,7 +3180,9 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
             }
         }
     }
-#else
+    return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
     float sumf[4][8];
     int sumi;
 
@@ -3258,5 +3215,4 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
             }
         }
     }
-#endif
 }