mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-02-02 15:02:47 +01:00
metal : add F32 -> Q4_0 copy kernel
This commit is contained in:
parent
7864a2cd9b
commit
9d69ecc0c9
16
ggml-metal.m
16
ggml-metal.m
@ -119,6 +119,10 @@ struct ggml_metal_context {
|
|||||||
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
|
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
|
||||||
GGML_METAL_DECL_KERNEL(cpy_f32_f32);
|
GGML_METAL_DECL_KERNEL(cpy_f32_f32);
|
||||||
GGML_METAL_DECL_KERNEL(cpy_f32_q8_0);
|
GGML_METAL_DECL_KERNEL(cpy_f32_q8_0);
|
||||||
|
GGML_METAL_DECL_KERNEL(cpy_f32_q4_0);
|
||||||
|
//GGML_METAL_DECL_KERNEL(cpy_f32_q4_1);
|
||||||
|
//GGML_METAL_DECL_KERNEL(cpy_f32_q5_0);
|
||||||
|
//GGML_METAL_DECL_KERNEL(cpy_f32_q5_1);
|
||||||
GGML_METAL_DECL_KERNEL(cpy_f16_f16);
|
GGML_METAL_DECL_KERNEL(cpy_f16_f16);
|
||||||
GGML_METAL_DECL_KERNEL(concat);
|
GGML_METAL_DECL_KERNEL(concat);
|
||||||
GGML_METAL_DECL_KERNEL(sqr);
|
GGML_METAL_DECL_KERNEL(sqr);
|
||||||
@ -326,6 +330,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||||||
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
|
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
|
||||||
GGML_METAL_ADD_KERNEL(cpy_f32_f32);
|
GGML_METAL_ADD_KERNEL(cpy_f32_f32);
|
||||||
GGML_METAL_ADD_KERNEL(cpy_f32_q8_0);
|
GGML_METAL_ADD_KERNEL(cpy_f32_q8_0);
|
||||||
|
GGML_METAL_ADD_KERNEL(cpy_f32_q4_0);
|
||||||
|
//GGML_METAL_ADD_KERNEL(cpy_f32_q4_1);
|
||||||
|
//GGML_METAL_ADD_KERNEL(cpy_f32_q5_0);
|
||||||
|
//GGML_METAL_ADD_KERNEL(cpy_f32_q5_1);
|
||||||
GGML_METAL_ADD_KERNEL(cpy_f16_f16);
|
GGML_METAL_ADD_KERNEL(cpy_f16_f16);
|
||||||
GGML_METAL_ADD_KERNEL(concat);
|
GGML_METAL_ADD_KERNEL(concat);
|
||||||
GGML_METAL_ADD_KERNEL(sqr);
|
GGML_METAL_ADD_KERNEL(sqr);
|
||||||
@ -428,6 +436,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||||||
GGML_METAL_DEL_KERNEL(cpy_f32_f16);
|
GGML_METAL_DEL_KERNEL(cpy_f32_f16);
|
||||||
GGML_METAL_DEL_KERNEL(cpy_f32_f32);
|
GGML_METAL_DEL_KERNEL(cpy_f32_f32);
|
||||||
GGML_METAL_DEL_KERNEL(cpy_f32_q8_0);
|
GGML_METAL_DEL_KERNEL(cpy_f32_q8_0);
|
||||||
|
GGML_METAL_DEL_KERNEL(cpy_f32_q4_0);
|
||||||
|
//GGML_METAL_DEL_KERNEL(cpy_f32_q4_1);
|
||||||
|
//GGML_METAL_DEL_KERNEL(cpy_f32_q5_0);
|
||||||
|
//GGML_METAL_DEL_KERNEL(cpy_f32_q5_1);
|
||||||
GGML_METAL_DEL_KERNEL(cpy_f16_f16);
|
GGML_METAL_DEL_KERNEL(cpy_f16_f16);
|
||||||
GGML_METAL_DEL_KERNEL(concat);
|
GGML_METAL_DEL_KERNEL(concat);
|
||||||
GGML_METAL_DEL_KERNEL(sqr);
|
GGML_METAL_DEL_KERNEL(sqr);
|
||||||
@ -1565,6 +1577,10 @@ void ggml_metal_graph_compute(
|
|||||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break;
|
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break;
|
||||||
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break;
|
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break;
|
||||||
case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q8_0]; break;
|
case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q8_0]; break;
|
||||||
|
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_0]; break;
|
||||||
|
//case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_1]; break;
|
||||||
|
//case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_0]; break;
|
||||||
|
//case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_1]; break;
|
||||||
default: GGML_ASSERT(false && "not implemented");
|
default: GGML_ASSERT(false && "not implemented");
|
||||||
};
|
};
|
||||||
} break;
|
} break;
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
using namespace metal;
|
using namespace metal;
|
||||||
|
|
||||||
#define MAX(x, y) ((x) > (y) ? (x) : (y))
|
#define MAX(x, y) ((x) > (y) ? (x) : (y))
|
||||||
|
// Smaller of x and y. This is a function-like macro: whichever argument is
// selected is expanded (and therefore evaluated) a second time, so do not
// pass expressions with side effects.
#define MIN(x, y) ((x) < (y) ? (x) : (y))
|
||||||
|
|
||||||
#define QK4_0 32
|
#define QK4_0 32
|
||||||
#define QR4_0 2
|
#define QR4_0 2
|
||||||
@ -1518,6 +1519,73 @@ kernel void kernel_cpy_f32_q8_0(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Copy F32 source data into a Q4_0-quantized destination.
//
// Dispatch layout (as read from the indexing below):
//   - tgpig.x / tgpig.y / tgpig.z select the source row (i01), plane (i02) and
//     batch (i03); one threadgroup therefore handles one source row.
//   - Threads of the threadgroup walk that row in steps of QK4_0 floats:
//     thread t starts at block t and advances by ntg.x blocks.
//
// Parameters:
//   src0        F32 source buffer, addressed with the byte strides nb00..nb03
//   dst         destination buffer, written as block_q4_0 records and
//               addressed with the byte strides nb0..nb3
//   ne00..ne03  source extents in elements (ne03 is not read here)
//   ne0..ne3    destination extents in elements (ne3 is not read here)
//
// Quantization of each group of QK4_0 values:
//   - `max` is the signed value with the largest magnitude, d = max / -8 is
//     stored as the block scale, and id = 1/d (or 0 when d == 0).
//   - Every value is scaled by id, offset by 8.5, truncated and clamped to 15.
//   - Byte j packs element j in the low nibble and element j + QK4_0/2 in the
//     high nibble.
//
// NOTE(review): the loop reads a full QK4_0 floats whenever i00 < ne00, so this
// presumably relies on ne00 being a multiple of QK4_0 - confirm the host side
// guarantees that.
kernel void kernel_cpy_f32_q4_0(
        device const float * src0,
        device        void * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne03,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant  uint64_t & nb03,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   int64_t & ne2,
        constant   int64_t & ne3,
        constant  uint64_t & nb0,
        constant  uint64_t & nb1,
        constant  uint64_t & nb2,
        constant  uint64_t & nb3,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {
    const int64_t i03 = tgpig[2];
    const int64_t i02 = tgpig[1];
    const int64_t i01 = tgpig[0];

    // Flat element index of the first element of this source row.
    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

    // Decompose that flat index into destination coordinates; i0 counts
    // QK4_0-sized blocks rather than individual elements.
    const int64_t i3 = n / (ne2*ne1*ne0);
    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_0;

    device block_q4_0 * dst_data = (device block_q4_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

    for (int64_t i00 = tpitg.x*QK4_0; i00 < ne00; i00 += ntg.x*QK4_0) {
        // Keep the source const-qualified through the byte-offset arithmetic.
        device const float * src = (device const float *)((device const char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);

        // Destination block for this group of QK4_0 values.
        device block_q4_0 * blk = dst_data + i00/QK4_0;

        float amax = 0.0f; // absolute max
        float max  = 0.0f; // signed value that attains amax

        for (int j = 0; j < QK4_0; j++) {
            const float v  = src[j];
            const float av = fabs(v);
            if (amax < av) {
                amax = av;
                max  = v;
            }
        }

        const float d  = max / -8;
        const float id = d ? 1.0f/d : 0.0f;

        blk->d = d;

        for (int j = 0; j < QK4_0/2; ++j) {
            const float x0 = src[0       + j]*id;
            const float x1 = src[QK4_0/2 + j]*id;

            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));

            // Assemble both nibbles in a register and store once, instead of a
            // store followed by a read-modify-write on device memory.
            blk->qs[j] = xi0 | (xi1 << 4);
        }
    }
}
|
||||||
|
|
||||||
kernel void kernel_concat(
|
kernel void kernel_concat(
|
||||||
device const char * src0,
|
device const char * src0,
|
||||||
device const char * src1,
|
device const char * src1,
|
||||||
|
Loading…
Reference in New Issue
Block a user