mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-26 06:10:29 +01:00
ggml : always define ggml_fp16_t as uint16_t (#5666)
* ggml : always define ggml_fp16_t as uint16_t ggml-ci * ggml : cont ggml-ci * ggml : cont * ggml : cont ggml-ci * ggml : cont ggml-ci * cuda : no longer ggml headers last ggml-ci * ggml : fix q6_K FP16 -> FP32 conversion ggml-ci * ggml : more FP16 -> FP32 conversion fixes ggml-ci
This commit is contained in:
parent
334f76fa38
commit
7e4f339c40
@ -1,3 +1,7 @@
|
|||||||
|
#include "ggml-cuda.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "ggml-backend-impl.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
@ -121,11 +125,6 @@
|
|||||||
|
|
||||||
#endif // defined(GGML_USE_HIPBLAS)
|
#endif // defined(GGML_USE_HIPBLAS)
|
||||||
|
|
||||||
// ggml-cuda need half type so keep ggml headers include at last
|
|
||||||
#include "ggml-cuda.h"
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "ggml-backend-impl.h"
|
|
||||||
|
|
||||||
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
||||||
|
|
||||||
#define CC_PASCAL 600
|
#define CC_PASCAL 600
|
||||||
|
27
ggml-impl.h
27
ggml-impl.h
@ -53,11 +53,23 @@ extern "C" {
|
|||||||
//
|
//
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||||
|
|
||||||
#define GGML_FP16_TO_FP32(x) ((float) (x))
|
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||||
#define GGML_FP32_TO_FP16(x) (x)
|
|
||||||
|
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||||
|
__fp16 tmp;
|
||||||
|
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
||||||
|
return (float)tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||||
|
ggml_fp16_t res;
|
||||||
|
__fp16 tmp = f;
|
||||||
|
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
@ -214,8 +226,7 @@ extern float ggml_table_f32_f16[1 << 16];
|
|||||||
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
||||||
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
||||||
// This is also true for POWER9.
|
// This is also true for POWER9.
|
||||||
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
|
#if !defined(GGML_FP16_TO_FP32)
|
||||||
|
|
||||||
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||||
uint16_t s;
|
uint16_t s;
|
||||||
memcpy(&s, &f, sizeof(uint16_t));
|
memcpy(&s, &f, sizeof(uint16_t));
|
||||||
@ -223,8 +234,10 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
||||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
#endif
|
||||||
|
|
||||||
|
#if !defined(GGML_FP32_TO_FP16)
|
||||||
|
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define GGML_HASHTABLE_FULL ((size_t)-1)
|
#define GGML_HASHTABLE_FULL ((size_t)-1)
|
||||||
|
@ -5654,8 +5654,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||||||
|
|
||||||
for (int i = 0; i < nb; ++i) {
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
const float d = y[i].d * (float)x[i].d;
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||||
const float dmin = -y[i].d * (float)x[i].dmin;
|
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||||
|
|
||||||
const uint8_t * restrict q2 = x[i].qs;
|
const uint8_t * restrict q2 = x[i].qs;
|
||||||
const int8_t * restrict q8 = y[i].qs;
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
@ -5804,8 +5804,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||||||
|
|
||||||
for (int i = 0; i < nb; ++i) {
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
const float d = y[i].d * (float)x[i].d;
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||||
const float dmin = -y[i].d * (float)x[i].dmin;
|
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||||
|
|
||||||
const uint8_t * restrict q2 = x[i].qs;
|
const uint8_t * restrict q2 = x[i].qs;
|
||||||
const int8_t * restrict q8 = y[i].qs;
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
@ -6458,7 +6458,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||||||
|
|
||||||
int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
|
int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
|
||||||
|
|
||||||
const float d = y[i].d * (float)x[i].d;
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||||
|
|
||||||
const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
|
const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
|
||||||
q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
|
q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
|
||||||
@ -6660,7 +6660,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||||||
|
|
||||||
int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
|
int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
|
||||||
|
|
||||||
const float d = y[i].d * (float)x[i].d;
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||||
|
|
||||||
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
||||||
|
|
||||||
@ -7163,9 +7163,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
||||||
|
|
||||||
const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
|
const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
|
||||||
sum_mins += y[i].d * (float)x[i].d[1] * summi;
|
sum_mins += y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * summi;
|
||||||
|
|
||||||
const float d = y[i].d * (float)x[i].d[0];
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);
|
||||||
|
|
||||||
const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
|
const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
|
||||||
|
|
||||||
@ -7823,7 +7823,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||||||
|
|
||||||
for (int i = 0; i < nb; ++i) {
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
const float d = y[i].d * (float)x[i].d;
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||||
const int8_t * sc = x[i].scales;
|
const int8_t * sc = x[i].scales;
|
||||||
|
|
||||||
const uint8_t * restrict q5 = x[i].qs;
|
const uint8_t * restrict q5 = x[i].qs;
|
||||||
@ -7965,7 +7965,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||||||
|
|
||||||
for (int i = 0; i < nb; ++i) {
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
const float d = y[i].d * (float)x[i].d;
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||||
const int8_t * sc = x[i].scales;
|
const int8_t * sc = x[i].scales;
|
||||||
|
|
||||||
const uint8_t * restrict q5 = x[i].qs;
|
const uint8_t * restrict q5 = x[i].qs;
|
||||||
@ -8533,7 +8533,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||||||
|
|
||||||
for (int i = 0; i < nb; ++i) {
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
const float d_all = (float)x[i].d;
|
const float d_all = GGML_FP16_TO_FP32(x[i].d);
|
||||||
|
|
||||||
const uint8_t * restrict q6 = x[i].ql;
|
const uint8_t * restrict q6 = x[i].ql;
|
||||||
const uint8_t * restrict qh = x[i].qh;
|
const uint8_t * restrict qh = x[i].qh;
|
||||||
@ -8704,7 +8704,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||||||
|
|
||||||
for (int i = 0; i < nb; ++i) {
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
const float d_all = (float)x[i].d;
|
const float d_all = GGML_FP16_TO_FP32(x[i].d);
|
||||||
|
|
||||||
const uint8_t * restrict q6 = x[i].ql;
|
const uint8_t * restrict q6 = x[i].ql;
|
||||||
const uint8_t * restrict qh = x[i].qh;
|
const uint8_t * restrict qh = x[i].qh;
|
||||||
@ -9523,7 +9523,6 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||||||
float sumf = 0;
|
float sumf = 0;
|
||||||
|
|
||||||
for (int ib = 0; ib < nb; ib += 2) {
|
for (int ib = 0; ib < nb; ib += 2) {
|
||||||
|
|
||||||
q4bits.val[0] = vld1q_u8(x[ib+0].qs);
|
q4bits.val[0] = vld1q_u8(x[ib+0].qs);
|
||||||
q4bits.val[1] = vld1q_u8(x[ib+1].qs);
|
q4bits.val[1] = vld1q_u8(x[ib+1].qs);
|
||||||
q8b.val[0] = vld1q_s8(y[ib+0].qs);
|
q8b.val[0] = vld1q_s8(y[ib+0].qs);
|
||||||
@ -9539,8 +9538,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||||||
prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
|
prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
|
||||||
prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
|
prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
|
||||||
|
|
||||||
sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2);
|
sumf +=
|
||||||
|
GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
|
||||||
|
GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
|
||||||
}
|
}
|
||||||
|
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
|
6
ggml.c
6
ggml.c
@ -323,7 +323,7 @@ float ggml_table_f32_f16[1 << 16];
|
|||||||
// note: do not use these inside ggml.c
|
// note: do not use these inside ggml.c
|
||||||
// these are meant to be used via the ggml.h API
|
// these are meant to be used via the ggml.h API
|
||||||
float ggml_fp16_to_fp32(ggml_fp16_t x) {
|
float ggml_fp16_to_fp32(ggml_fp16_t x) {
|
||||||
return (float) GGML_FP16_TO_FP32(x);
|
return GGML_FP16_TO_FP32(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
||||||
@ -798,7 +798,7 @@ inline static float vaddvq_f32(float32x4_t v) {
|
|||||||
#define GGML_F16x8 float16x8_t
|
#define GGML_F16x8 float16x8_t
|
||||||
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
|
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
|
||||||
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
|
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
|
||||||
#define GGML_F16x8_LOAD vld1q_f16
|
#define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
|
||||||
#define GGML_F16x8_STORE vst1q_f16
|
#define GGML_F16x8_STORE vst1q_f16
|
||||||
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
|
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
|
||||||
#define GGML_F16x8_ADD vaddq_f16
|
#define GGML_F16x8_ADD vaddq_f16
|
||||||
@ -841,7 +841,7 @@ inline static float vaddvq_f32(float32x4_t v) {
|
|||||||
#define GGML_F32Cx4 float32x4_t
|
#define GGML_F32Cx4 float32x4_t
|
||||||
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
|
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
|
||||||
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
|
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
|
||||||
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x))
|
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
|
||||||
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
|
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
|
||||||
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
|
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
|
||||||
#define GGML_F32Cx4_ADD vaddq_f32
|
#define GGML_F32Cx4_ADD vaddq_f32
|
||||||
|
6
ggml.h
6
ggml.h
@ -315,13 +315,7 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__ARM_NEON) && defined(__CUDACC__)
|
|
||||||
typedef half ggml_fp16_t;
|
|
||||||
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
|
|
||||||
typedef __fp16 ggml_fp16_t;
|
|
||||||
#else
|
|
||||||
typedef uint16_t ggml_fp16_t;
|
typedef uint16_t ggml_fp16_t;
|
||||||
#endif
|
|
||||||
|
|
||||||
// convert FP16 <-> FP32
|
// convert FP16 <-> FP32
|
||||||
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
||||||
|
Loading…
Reference in New Issue
Block a user