mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-15 23:00:46 +01:00
iq2_xs: WIP Metal
This commit is contained in:
parent
9b6e38d8c0
commit
0aacd55159
42
ggml-metal.m
42
ggml-metal.m
@ -89,6 +89,7 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_i32);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_iq2_xxs);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_iq2_xs);
|
||||
GGML_METAL_DECL_KERNEL(rms_norm);
|
||||
GGML_METAL_DECL_KERNEL(group_norm);
|
||||
GGML_METAL_DECL_KERNEL(norm);
|
||||
@ -108,6 +109,7 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_iq2_xxs_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_iq2_xs_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_f32_f32);
|
||||
//GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f16);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32);
|
||||
@ -124,6 +126,7 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xxs_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xs_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
|
||||
@ -137,6 +140,7 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_iq2_xxs_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_iq2_xs_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_id_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_id_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_id_q4_0_f32);
|
||||
@ -150,6 +154,7 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_id_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_id_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xxs_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xs_f32);
|
||||
GGML_METAL_DECL_KERNEL(rope_f32);
|
||||
GGML_METAL_DECL_KERNEL(rope_f16);
|
||||
GGML_METAL_DECL_KERNEL(alibi_f32);
|
||||
@ -385,6 +390,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_i32);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_iq2_xxs);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_iq2_xs);
|
||||
GGML_METAL_ADD_KERNEL(rms_norm);
|
||||
GGML_METAL_ADD_KERNEL(group_norm);
|
||||
GGML_METAL_ADD_KERNEL(norm);
|
||||
@ -404,6 +410,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_iq2_xxs_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_iq2_xs_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_f32_f32);
|
||||
//GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f16);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32);
|
||||
@ -420,6 +427,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xxs_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xs_f32);
|
||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
||||
@ -434,6 +442,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_iq2_xxs_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_iq2_xs_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_id_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_id_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_id_q4_0_f32);
|
||||
@ -447,6 +456,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_id_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_id_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xxs_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xs_f32);
|
||||
}
|
||||
GGML_METAL_ADD_KERNEL(rope_f32);
|
||||
GGML_METAL_ADD_KERNEL(rope_f16);
|
||||
@ -513,6 +523,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_i32);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_iq2_xxs);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_iq2_xs);
|
||||
GGML_METAL_DEL_KERNEL(rms_norm);
|
||||
GGML_METAL_DEL_KERNEL(group_norm);
|
||||
GGML_METAL_DEL_KERNEL(norm);
|
||||
@ -532,6 +543,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_iq2_xxs_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_iq2_xs_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_f32_f32);
|
||||
//GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f16);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32);
|
||||
@ -548,6 +560,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xxs_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xs_f32);
|
||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
||||
@ -562,6 +575,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_iq2_xxs_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_iq2_xs_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_id_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_id_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_id_q4_0_f32);
|
||||
@ -575,6 +589,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_id_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_id_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xxs_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xs_f32);
|
||||
}
|
||||
GGML_METAL_DEL_KERNEL(rope_f32);
|
||||
GGML_METAL_DEL_KERNEL(rope_f16);
|
||||
@ -1557,6 +1572,7 @@ bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
|
||||
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
|
||||
case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xxs_f32]; break;
|
||||
case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xs_f32]; break;
|
||||
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
|
||||
}
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
@ -1675,6 +1691,12 @@ bool ggml_metal_graph_compute(
|
||||
nth1 = 16;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xxs_f32];
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xs_f32];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
|
||||
@ -1708,12 +1730,12 @@ bool ggml_metal_graph_compute(
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
|
||||
src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
|
||||
//src0t == GGML_TYPE_IQ2_XXS ||
|
||||
src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_IQ2_XXS) {
|
||||
[encoder setThreadgroupMemoryLength:(256*8+128) atIndex:0];
|
||||
else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
|
||||
const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q4_K) {
|
||||
@ -1806,6 +1828,7 @@ bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_K_f32]; break;
|
||||
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break;
|
||||
case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xxs_f32]; break;
|
||||
case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xs_f32]; break;
|
||||
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
|
||||
}
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
@ -1927,6 +1950,12 @@ bool ggml_metal_graph_compute(
|
||||
nth1 = 16;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xxs_f32];
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xs_f32];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
|
||||
@ -1976,12 +2005,12 @@ bool ggml_metal_graph_compute(
|
||||
|
||||
if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
|
||||
src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
|
||||
//src2t == GGML_TYPE_IQ2_XXS ||
|
||||
src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_IQ2_XXS) {
|
||||
[encoder setThreadgroupMemoryLength:(256*8+128) atIndex:0];
|
||||
else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
|
||||
const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_Q4_K) {
|
||||
@ -2022,6 +2051,7 @@ bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
|
||||
case GGML_TYPE_I32: [encoder setComputePipelineState:ctx->pipeline_get_rows_i32]; break;
|
||||
case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xxs]; break;
|
||||
case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xs]; break;
|
||||
default: GGML_ASSERT(false && "not implemented");
|
||||
}
|
||||
|
||||
|
239
ggml-metal.metal
239
ggml-metal.metal
@ -2452,6 +2452,13 @@ typedef struct {
|
||||
} block_iq2_xxs;
|
||||
// 66 bytes / block for QK_K = 256, so 2.0625 bpw
|
||||
|
||||
typedef struct {
|
||||
half d;
|
||||
uint16_t qs[QK_K/8];
|
||||
uint8_t scales[QK_K/32];
|
||||
} block_iq2_xs;
|
||||
// 74 bytes / block for QK_K = 256, so 2.3125 bpw
|
||||
|
||||
//====================================== dot products =========================
|
||||
|
||||
void kernel_mul_mv_q2_K_f32_impl(
|
||||
@ -3820,6 +3827,149 @@ kernel void kernel_mul_mv_iq2_xxs_f32(
|
||||
kernel_mul_mv_iq2_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
void kernel_mul_mv_iq2_xs_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
const int nb = ne00/QK_K;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
|
||||
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
||||
const int ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
device const block_iq2_xs * x = (device const block_iq2_xs *) src0 + ib_row + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float yl[32];
|
||||
float sumf[N_DST]={0.f}, all_sum;
|
||||
|
||||
const int nb32 = nb * (QK_K / 32);
|
||||
|
||||
threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
|
||||
threadgroup uint8_t * shared_signs = (threadgroup uint8_t *)(values + 256);
|
||||
{
|
||||
int nval = 4;
|
||||
int pos = (32*sgitg + tiisg)*nval;
|
||||
for (int i = 0; i < nval; ++i) values[pos + i] = iq2xs_grid[pos + i];
|
||||
nval = 2;
|
||||
pos = (32*sgitg + tiisg)*nval;
|
||||
for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
const int ix = tiisg;
|
||||
|
||||
device const float * y4 = y + 32 * ix;
|
||||
|
||||
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
|
||||
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
yl[i] = y4[i];
|
||||
}
|
||||
|
||||
const int ibl = ib32 / (QK_K / 32);
|
||||
const int ib = ib32 % (QK_K / 32);
|
||||
|
||||
device const block_iq2_xs * xr = x + ibl;
|
||||
device const uint16_t * q2 = xr->qs + 4 * ib;
|
||||
device const uint8_t * sc = xr->scales + ib;
|
||||
device const half * dh = &xr->d;
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
|
||||
const float db = dh[0];
|
||||
const uint8_t ls1 = sc[0] & 0xf;
|
||||
const uint8_t ls2 = sc[0] >> 4;
|
||||
const float d1 = db * (0.5f + ls1);
|
||||
const float d2 = db * (0.5f + ls2);
|
||||
|
||||
float sum1 = 0, sum2 = 0;
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511));
|
||||
const uint8_t signs = shared_signs[(q2[l] >> 9)];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sum1 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
for (int l = 2; l < 4; ++l) {
|
||||
const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511));
|
||||
const uint8_t signs = shared_signs[(q2[l] >> 9)];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sum2 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
sumf[row] += d1 * sum1 + d2 * sum2;
|
||||
|
||||
dh += nb*sizeof(block_iq2_xxs)/2;
|
||||
q2 += nb*sizeof(block_iq2_xxs)/2;
|
||||
sc += nb*sizeof(block_iq2_xxs);
|
||||
}
|
||||
|
||||
y4 += 32 * 32;
|
||||
}
|
||||
#else
|
||||
// TODO
|
||||
#endif
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_iq2_xs_f32")]]
|
||||
kernel void kernel_mul_mv_iq2_xs_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
//============================= templates and their specializations =============================
|
||||
|
||||
// NOTE: this is not dequantizing - we are simply fitting the template
|
||||
@ -4116,6 +4266,27 @@ void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint16_t * q2 = xb->qs + 4*ib32;
|
||||
const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
|
||||
constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + (q2[2*il+0] & 511));
|
||||
uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9];
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
|
||||
}
|
||||
grid = (constant uint8_t *)(iq2xxs_grid + (q2[2*il+1] & 511));
|
||||
signs = ksigns_iq2xs[q2[2*il+1] >> 9];
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
||||
kernel void kernel_get_rows(
|
||||
device const void * src0,
|
||||
@ -4656,6 +4827,7 @@ template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows
|
||||
template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
|
||||
//
|
||||
// matrix-matrix multiplication
|
||||
@ -4693,6 +4865,7 @@ template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<b
|
||||
template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
|
||||
//
|
||||
// indirect matrix-matrix multiplication
|
||||
@ -4742,6 +4915,7 @@ template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mu
|
||||
template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
|
||||
//
|
||||
// matrix-vector multiplication
|
||||
@ -5579,3 +5753,68 @@ kernel void kernel_mul_mv_id_iq2_xxs_f32(
|
||||
tiisg,
|
||||
sgitg);
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_id_iq2_xs_f32")]]
|
||||
kernel void kernel_mul_mv_id_iq2_xs_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
device const char * src00,
|
||||
device const char * src01,
|
||||
device const char * src02,
|
||||
device const char * src03,
|
||||
device const char * src04,
|
||||
device const char * src05,
|
||||
device const char * src06,
|
||||
device const char * src07,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
|
||||
|
||||
const int64_t bid = tgpig.z/(ne12*ne13);
|
||||
|
||||
tgpig.z = tgpig.z%(ne12*ne13);
|
||||
|
||||
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
|
||||
|
||||
kernel_mul_mv_iq2_xs_f32_impl(
|
||||
src0[id],
|
||||
(device const float *) (src1 + bid*nb11),
|
||||
dst + bid*ne0,
|
||||
ne00,
|
||||
ne01,
|
||||
ne02,
|
||||
ne10,
|
||||
ne12,
|
||||
ne0,
|
||||
ne1,
|
||||
r2,
|
||||
r3,
|
||||
shared_values,
|
||||
tgpig,
|
||||
tiisg,
|
||||
sgitg);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user