From e89597c062e7119bcfbca60585563272d128a1c4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 29 Nov 2023 12:44:47 +0200 Subject: [PATCH] metal : implement soft_max_ext --- ggml-metal.m | 20 +++++++++++++++----- ggml-metal.metal | 26 ++++++++++++++++---------- ggml.c | 18 ++++++++++++++++-- ggml.h | 8 ++++++++ llama.cpp | 34 ++++++++++++++++++++-------------- 5 files changed, 75 insertions(+), 31 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index d52a1c3c4..0b468bea0 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1036,11 +1036,21 @@ void ggml_metal_graph_compute( nth /= 2; [encoder setComputePipelineState:ctx->pipeline_soft_max]; } - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + + const float scale = ((float *) dst->op_params)[0]; + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + if (id_src1) { + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + } else { + [encoder setBuffer:nil offset:0 atIndex:1]; + } + + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:6]; [encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; diff --git a/ggml-metal.metal b/ggml-metal.metal index 5d1357cd7..61eb0c403 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -180,10 +180,12 @@ kernel void kernel_gelu( kernel void kernel_soft_max( device const float * src0, + device const float * src1, device float * dst, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, + constant float & scale, threadgroup float * buf [[threadgroup(0)]], uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], @@ -194,14 +196,15 @@ kernel void kernel_soft_max( const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); - device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; - device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + device const float * pmask = src1 ? src1 + i01*ne00 : nullptr; + device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; // parallel max - float lmax = tpitg < ne00 ? psrc0[tpitg] : -INFINITY; + float lmax = (tpitg < ne00) ? (psrc0[tpitg]*scale + (pmask ? pmask[tpitg] : 0.0f)) : -INFINITY; for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) { - lmax = MAX(lmax, psrc0[i00]); + lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)); } float max = simd_max(lmax); @@ -225,7 +228,7 @@ kernel void kernel_soft_max( // parallel sum float lsum = 0.0f; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - const float exp_psrc0 = exp(psrc0[i00] - max); + const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max); lsum += exp_psrc0; // Remember the result of exp here. exp is expensive, so we really do not // wish to compute it twice. @@ -257,10 +260,12 @@ kernel void kernel_soft_max( kernel void kernel_soft_max_4( device const float * src0, + device const float * src1, device float * dst, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, + constant float & scale, threadgroup float * buf [[threadgroup(0)]], uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], @@ -271,14 +276,15 @@ kernel void kernel_soft_max_4( const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); - device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); - device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + device const float4 * pmask = src1 ? (device const float4 *)(src1 + i01*ne00) : nullptr; + device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); // parallel max - float4 lmax4 = tpitg < ne00/4 ? psrc4[tpitg] : -INFINITY; + float4 lmax4 = tpitg < ne00/4 ? (psrc4[tpitg]*scale + (pmask ? pmask[tpitg] : 0.0f)) : -INFINITY; for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) { - lmax4 = fmax(lmax4, psrc4[i00]); + lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)); } const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); @@ -303,7 +309,7 @@ kernel void kernel_soft_max_4( // parallel sum float4 lsum4 = 0.0f; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - const float4 exp_psrc4 = exp(psrc4[i00] - max); + const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max); lsum4 += exp_psrc4; pdst4[i00] = exp_psrc4; } diff --git a/ggml.c b/ggml.c index c522a101f..a0b04cbeb 100644 --- a/ggml.c +++ b/ggml.c @@ -4826,6 +4826,8 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace( static struct ggml_tensor * ggml_soft_max_impl( struct ggml_context * ctx, struct ggml_tensor * a, + struct ggml_tensor * mask, + float scale, bool inplace) { bool is_node = false; @@ -4835,9 +4837,13 @@ static struct ggml_tensor * ggml_soft_max_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + float params[] = { scale }; + ggml_set_op_params(result, params, sizeof(params)); + result->op = GGML_OP_SOFT_MAX; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; + result->src[1] = mask; return result; } @@ -4845,13 +4851,21 @@ static struct ggml_tensor * ggml_soft_max_impl( struct ggml_tensor * ggml_soft_max( struct ggml_context * ctx, struct ggml_tensor * a) { - return ggml_soft_max_impl(ctx, a, false); + return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false); } struct ggml_tensor * ggml_soft_max_inplace( struct ggml_context * ctx, struct ggml_tensor * a) { - return ggml_soft_max_impl(ctx, a, true); + return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true); +} + +struct ggml_tensor * ggml_soft_max_ext( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * mask, + float scale) { + return ggml_soft_max_impl(ctx, a, mask, scale, false); } // ggml_soft_max_back diff --git a/ggml.h b/ggml.h index 4d6d4edfd..2f6787d4e 100644 --- a/ggml.h +++ b/ggml.h @@ -1282,6 +1282,14 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // fused soft_max(a*scale + mask) + // mask is optional + GGML_API struct ggml_tensor * ggml_soft_max_ext( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * mask, + float scale); + GGML_API struct ggml_tensor * ggml_soft_max_back( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/llama.cpp b/llama.cpp index cb544228b..ba837e26f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3705,23 +3705,29 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); - kq = ggml_scale(ctx, kq, kq_scale); - cb(kq, "kq_scaled", il); - + // TODO: !!!!!!!!! if (max_alibi_bias > 0.0f) { - // TODO: n_head or n_head_kv - // TODO: K-shift is likely not working - // TODO: change to ggml_add - kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias); - cb(kq, "kq_scaled_alibi", il); + kq = ggml_scale(ctx, kq, kq_scale); + cb(kq, "kq_scaled", il); + + if (max_alibi_bias > 0.0f) { + // TODO: n_head or n_head_kv + // TODO: K-shift is likely not working + // TODO: change to ggml_add + kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias); + cb(kq, "kq_scaled_alibi", il); + } + + kq = ggml_add(ctx, kq, kq_mask); + cb(kq, "kq_masked", il); + + kq = ggml_soft_max(ctx, kq); + cb(kq, "kq_soft_max", il); + } else { + kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head))); + cb(kq, "kq_soft_max_ext", il); } - kq = ggml_add(ctx, kq, kq_mask); - cb(kq, "kq_masked", il); - - kq = ggml_soft_max(ctx, kq); - cb(kq, "kq_soft_max", il); - // split cached v into n_head heads struct ggml_tensor * v = ggml_view_3d(ctx, kv.v,