From 481044d50cfe8eaa6cd0c1a1b445680e4b0b3ebc Mon Sep 17 00:00:00 2001
From: Cameron Kaiser
Date: Fri, 24 Mar 2023 08:19:26 -0700
Subject: [PATCH] additional optimizations for POWER9 (#454)

---
 Makefile |  3 +-
 ggml.c   | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 91eebaebd..e8b128cb8 100644
--- a/Makefile
+++ b/Makefile
@@ -156,7 +156,8 @@ endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
-		CFLAGS += -mpower9-vector
+		CFLAGS += -mcpu=power9
+		CXXFLAGS += -mcpu=power9
 	endif
 	# Require c++23's std::byteswap for big-endian support.
 	ifeq ($(UNAME_M),ppc64)
diff --git a/ggml.c b/ggml.c
index 800390a88..92b857a00 100644
--- a/ggml.c
+++ b/ggml.c
@@ -175,6 +175,39 @@ typedef double ggml_float;
 #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
 
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+/* the inline asm below is about 12% faster than the lookup method, so the generic macros use it directly */
+#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { /* half bits in h -> float, via POWER9 convert instructions */
+    register float f;  /* result, single precision */
+    register double d; /* scratch FPR holding the intermediate double */
+    __asm__(
+        "mtfprd %0,%2\n"   /* move the raw 16-bit pattern from the GPR into the scratch FPR */
+        "xscvhpdp %0,%0\n" /* half precision -> double precision, in place */
+        "frsp %1,%0\n" :   /* round the double to single precision into the result */
+        /* temp */ "=d"(d),
+        /* out */  "=f"(f):
+        /* in */   "r"(h));
+    return f;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { /* float -> half bits, via POWER9 convert instructions */
+    register double d;      /* scratch FPR receiving the converted half */
+    register ggml_fp16_t r; /* result, raw half-precision bit pattern */
+    __asm__( /* xscvdphp can work on double or single precision */
+        "xscvdphp %0,%2\n" /* convert the input to half precision in the scratch FPR */
+        "mffprd %1,%0\n" : /* move the converted bits from the FPR to the result GPR */
+        /* temp */ "=d"(d),
+        /* out */  "=r"(r):
+        /* in */   "f"(f));
+    return r;
+}
+
 #else
 
 // FP16 <-> FP32
@@ -272,6 +305,7 @@ static float table_f32_f16[1 << 16];
 
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
 #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
 
 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
@@ -462,7 +496,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, void * restric
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
     assert(k % QK == 0);
 
-#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__)
+#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
     const int nb = k / QK;
     const size_t bs = sizeof(float) + QK/2;
 
@@ -472,7 +506,52 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
     uint8_t pp[QK/2];
 #endif
 
-#if __ARM_NEON
+#if defined(__POWER9_VECTOR__)
+#if QK == 32
+    const vector float v85 = vec_splats(8.5f); // added after scaling: 8 = offset into the 4-bit range, 0.5 = rounding term
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        vector float srcv [8]; // the 32 inputs of block i, four floats per vector
+        vector float asrcv[8]; // their absolute values
+        vector float amaxv[8]; // reduction scratch; only the even slots are written
+
+        for (int l = 0; l < 8; l++) srcv[l]  = vec_xl(0, x + i*32 + 4*l); // vec_xl: no 16-byte alignment assumption, keeps const
+        for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]);
+
+        for (int l = 0; l < 4; l++) amaxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]);
+        // second reduction level, unrolled: fold slots 0|2 and 4|6
+        amaxv[0] = vec_max(amaxv[0], amaxv[2]);
+        amaxv[4] = vec_max(amaxv[4], amaxv[6]);
+        // last reduction level: fold slots 0|4, leaving the lane-wise maxima in amaxv[0]
+        amaxv[0] = vec_max(amaxv[0], amaxv[4]);
+
+        amax = MAX(
+                   MAX(vec_extract(amaxv[0], 0), vec_extract(amaxv[0], 1)),
+                   MAX(vec_extract(amaxv[0], 2), vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 3) - 1); // scale: the largest magnitude maps to 7
+        const float id = d ? 1.0/d : 0.0;      // inverse scale; 0 when the whole block is zero
+
+        *(float *)pd = d; // store the block scale (pd and pb are declared outside this hunk)
+        pd += bs;         // advance by one block stride
+
+        const vector float vid = vec_splats(id);
+        for (int l = 0; l < 8; l++) {
+            const vector float vf = vec_madd(srcv[l], vid, v85); // srcv[l]*id + 8.5
+            const vector signed int vi = vec_signed(vf);
+
+            pb[2*l + 0] = vec_extract(vi, 0) | (vec_extract(vi, 1) << 4); // two 4-bit values per byte, low nibble first
+            pb[2*l + 1] = vec_extract(vi, 2) | (vec_extract(vi, 3) << 4);
+        }
+
+        // the packed bytes were stored straight into pb above, so there is no staging copy through pp
+        pb += bs;
+    }
+#else
+#error "not implemented for QK"
+#endif
+#elif __ARM_NEON
 #if QK == 32
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max