mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 22:08:46 +01:00
ggml : add Q5 WASM SIMD + GGML_FTYPE
This commit is contained in:
parent
f0d70f147d
commit
6bc4400e67
162
ggml.c
162
ggml.c
@ -330,7 +330,7 @@ static ggml_fp16_t table_exp_f16[1 << 16];
|
|||||||
// precomputed f32 table for f16 (256 KB)
|
// precomputed f32 table for f16 (256 KB)
|
||||||
static float table_f32_f16[1 << 16];
|
static float table_f32_f16[1 << 16];
|
||||||
|
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
|
||||||
#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
|
#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
|
||||||
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
|
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
|
||||||
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
|
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
|
||||||
@ -1087,7 +1087,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
|
|||||||
const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
|
const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
|
||||||
const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
|
const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
|
||||||
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
|
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
|
||||||
const v128_t vc = wasm_i32x4_min_u(vi, wasm_i32x4_splat(15));
|
const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15));
|
||||||
|
|
||||||
y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4);
|
y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4);
|
||||||
y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4);
|
y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4);
|
||||||
@ -3180,6 +3180,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|||||||
}
|
}
|
||||||
|
|
||||||
*s = vaddvq_f32(sumv);
|
*s = vaddvq_f32(sumv);
|
||||||
|
#elif defined(__wasm_simd128__)
|
||||||
|
v128_t sumv = wasm_f32x4_splat(0.0f);
|
||||||
|
|
||||||
|
uint64_t tmp[4];
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
const block_q5_0 * restrict x0 = &x[i];
|
||||||
|
const block_q8_0 * restrict y0 = &y[i];
|
||||||
|
|
||||||
|
const v128_t m4b = wasm_i8x16_splat(0x0F);
|
||||||
|
const v128_t s16b = wasm_i8x16_splat(0x10);
|
||||||
|
|
||||||
|
// extract the 5th bit
|
||||||
|
uint32_t qh;
|
||||||
|
memcpy(&qh, x0->qh, sizeof(qh));
|
||||||
|
|
||||||
|
tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
|
||||||
|
tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
|
||||||
|
tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
|
||||||
|
tmp[3] = table_b2b_u[(qh >> 24) ];
|
||||||
|
|
||||||
|
const v128_t qhl = wasm_v128_load(tmp + 0);
|
||||||
|
const v128_t qhh = wasm_v128_load(tmp + 2);
|
||||||
|
|
||||||
|
const v128_t v0 = wasm_v128_load(x0->qs);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit
|
||||||
|
const v128_t v0l = wasm_v128_and (v0, m4b);
|
||||||
|
const v128_t v0h = wasm_u8x16_shr(v0, 4);
|
||||||
|
|
||||||
|
// interleave
|
||||||
|
const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
|
||||||
|
const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
|
||||||
|
|
||||||
|
// add high bit and sub 16
|
||||||
|
const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
|
||||||
|
const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const v128_t v1l = wasm_v128_load(y0->qs);
|
||||||
|
const v128_t v1h = wasm_v128_load(y0->qs + 16);
|
||||||
|
|
||||||
|
// int8x16 -> int16x8
|
||||||
|
const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
|
||||||
|
const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
|
||||||
|
const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
|
||||||
|
const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
|
||||||
|
|
||||||
|
const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
|
||||||
|
const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
|
||||||
|
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
|
||||||
|
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
|
||||||
|
|
||||||
|
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
||||||
|
|
||||||
|
// dot product
|
||||||
|
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
|
||||||
|
wasm_i32x4_add(
|
||||||
|
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
||||||
|
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
||||||
|
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
||||||
|
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
||||||
|
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
// Initialize accumulator with zeros
|
// Initialize accumulator with zeros
|
||||||
__m256 acc = _mm256_setzero_ps();
|
__m256 acc = _mm256_setzero_ps();
|
||||||
@ -3311,6 +3377,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|||||||
}
|
}
|
||||||
|
|
||||||
*s = vaddvq_f32(sumv) + summs;
|
*s = vaddvq_f32(sumv) + summs;
|
||||||
|
#elif defined(__wasm_simd128__)
|
||||||
|
v128_t sumv = wasm_f32x4_splat(0.0f);
|
||||||
|
|
||||||
|
float summs = 0.0f;
|
||||||
|
|
||||||
|
uint64_t tmp[4];
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
const block_q5_1 * restrict x0 = &x[i];
|
||||||
|
const block_q8_1 * restrict y0 = &y[i];
|
||||||
|
|
||||||
|
summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
|
||||||
|
|
||||||
|
const v128_t m4b = wasm_i8x16_splat(0x0F);
|
||||||
|
|
||||||
|
// extract the 5th bit
|
||||||
|
uint32_t qh;
|
||||||
|
memcpy(&qh, x0->qh, sizeof(qh));
|
||||||
|
|
||||||
|
tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
|
||||||
|
tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
|
||||||
|
tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
|
||||||
|
tmp[3] = table_b2b_u[(qh >> 24) ];
|
||||||
|
|
||||||
|
const v128_t qhl = wasm_v128_load(tmp + 0);
|
||||||
|
const v128_t qhh = wasm_v128_load(tmp + 2);
|
||||||
|
|
||||||
|
const v128_t v0 = wasm_v128_load(x0->qs);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit
|
||||||
|
const v128_t v0l = wasm_v128_and (v0, m4b);
|
||||||
|
const v128_t v0h = wasm_u8x16_shr(v0, 4);
|
||||||
|
|
||||||
|
static bool x = true;
|
||||||
|
|
||||||
|
// interleave
|
||||||
|
const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
|
||||||
|
const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
|
||||||
|
|
||||||
|
// add high bit
|
||||||
|
const v128_t v0lf = wasm_v128_or(v0lz, qhl);
|
||||||
|
const v128_t v0hf = wasm_v128_or(v0hz, qhh);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const v128_t v1l = wasm_v128_load(y0->qs);
|
||||||
|
const v128_t v1h = wasm_v128_load(y0->qs + 16);
|
||||||
|
|
||||||
|
// int8x16 -> int16x8
|
||||||
|
const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
|
||||||
|
const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
|
||||||
|
const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
|
||||||
|
const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
|
||||||
|
|
||||||
|
const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
|
||||||
|
const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
|
||||||
|
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
|
||||||
|
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
|
||||||
|
|
||||||
|
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
||||||
|
|
||||||
|
// dot product
|
||||||
|
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
|
||||||
|
wasm_i32x4_add(
|
||||||
|
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
||||||
|
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
||||||
|
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
||||||
|
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
||||||
|
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
// Initialize accumulator with zeros
|
// Initialize accumulator with zeros
|
||||||
__m256 acc = _mm256_setzero_ps();
|
__m256 acc = _mm256_setzero_ps();
|
||||||
@ -4057,6 +4194,27 @@ bool ggml_is_quantized(enum ggml_type type) {
|
|||||||
return GGML_IS_QUANTIZED[type];
|
return GGML_IS_QUANTIZED[type];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
||||||
|
enum ggml_type wtype = GGML_TYPE_COUNT;
|
||||||
|
|
||||||
|
switch (ftype) {
|
||||||
|
case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
|
||||||
|
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_ASSERT(wtype != GGML_TYPE_COUNT);
|
||||||
|
|
||||||
|
return wtype;
|
||||||
|
}
|
||||||
|
|
||||||
static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
||||||
return tensor->nb[0] > tensor->nb[1];
|
return tensor->nb[0] > tensor->nb[1];
|
||||||
}
|
}
|
||||||
|
17
ggml.h
17
ggml.h
@ -232,6 +232,20 @@ extern "C" {
|
|||||||
GGML_TYPE_COUNT,
|
GGML_TYPE_COUNT,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// model file types
|
||||||
|
enum ggml_ftype {
|
||||||
|
GGML_FTYPE_UNKNOWN = -1,
|
||||||
|
GGML_FTYPE_ALL_F32 = 0,
|
||||||
|
GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
||||||
|
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||||
|
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||||
|
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||||
|
GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
||||||
|
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||||
|
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
||||||
|
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||||
|
};
|
||||||
|
|
||||||
// available tensor operations:
|
// available tensor operations:
|
||||||
enum ggml_op {
|
enum ggml_op {
|
||||||
GGML_OP_NONE = 0,
|
GGML_OP_NONE = 0,
|
||||||
@ -385,6 +399,9 @@ extern "C" {
|
|||||||
|
|
||||||
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
||||||
|
|
||||||
|
// TODO: temporary until model loading of ggml examples is refactored
|
||||||
|
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
||||||
|
|
||||||
// main
|
// main
|
||||||
|
|
||||||
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
||||||
|
Loading…
Reference in New Issue
Block a user