mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-04 01:57:53 +01:00
llama : support quantum K cache (wip)
This commit is contained in:
parent
66aaac9867
commit
d04ee928a2
@ -1114,7 +1114,7 @@ void ggml_metal_graph_compute(
|
|||||||
!ggml_is_transposed(src1) &&
|
!ggml_is_transposed(src1) &&
|
||||||
src1t == GGML_TYPE_F32 &&
|
src1t == GGML_TYPE_F32 &&
|
||||||
ne00 % 32 == 0 && ne00 >= 64 &&
|
ne00 % 32 == 0 && ne00 >= 64 &&
|
||||||
ne11 > ne11_mm_min) {
|
(ne11 > ne11_mm_min || ne12 > 1)) {
|
||||||
//printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
|
//printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
|
||||||
switch (src0->type) {
|
switch (src0->type) {
|
||||||
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
|
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
|
||||||
|
30
llama.cpp
30
llama.cpp
@ -1522,7 +1522,8 @@ struct llama_context {
|
|||||||
static bool llama_kv_cache_init(
|
static bool llama_kv_cache_init(
|
||||||
const struct llama_hparams & hparams,
|
const struct llama_hparams & hparams,
|
||||||
struct llama_kv_cache & cache,
|
struct llama_kv_cache & cache,
|
||||||
ggml_type wtype,
|
ggml_type ktype,
|
||||||
|
ggml_type vtype,
|
||||||
uint32_t n_ctx,
|
uint32_t n_ctx,
|
||||||
int n_gpu_layers,
|
int n_gpu_layers,
|
||||||
bool offload) {
|
bool offload) {
|
||||||
@ -1541,7 +1542,7 @@ static bool llama_kv_cache_init(
|
|||||||
cache.cells.clear();
|
cache.cells.clear();
|
||||||
cache.cells.resize(n_ctx);
|
cache.cells.resize(n_ctx);
|
||||||
|
|
||||||
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*n_layer*ggml_tensor_overhead());
|
cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
|
||||||
memset(cache.buf.data, 0, cache.buf.size);
|
memset(cache.buf.data, 0, cache.buf.size);
|
||||||
|
|
||||||
struct ggml_init_params params;
|
struct ggml_init_params params;
|
||||||
@ -1566,8 +1567,8 @@ static bool llama_kv_cache_init(
|
|||||||
GGML_UNUSED(offload);
|
GGML_UNUSED(offload);
|
||||||
|
|
||||||
for (int i = 0; i < (int) n_layer; i++) {
|
for (int i = 0; i < (int) n_layer; i++) {
|
||||||
ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx);
|
ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
|
||||||
ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx);
|
ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
|
||||||
ggml_format_name(k, "cache_k_l%d", i);
|
ggml_format_name(k, "cache_k_l%d", i);
|
||||||
ggml_format_name(v, "cache_v_l%d", i);
|
ggml_format_name(v, "cache_v_l%d", i);
|
||||||
cache.k_l.push_back(k);
|
cache.k_l.push_back(k);
|
||||||
@ -3558,8 +3559,8 @@ static void llm_build_k_shift(
|
|||||||
ggml_rope_custom_inplace(ctx,
|
ggml_rope_custom_inplace(ctx,
|
||||||
ggml_view_3d(ctx, kv.k_l[il],
|
ggml_view_3d(ctx, kv.k_l[il],
|
||||||
n_embd_head, n_head_kv, n_ctx,
|
n_embd_head, n_head_kv, n_ctx,
|
||||||
ggml_element_size(kv.k_l[il])*n_embd_head,
|
ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
|
||||||
ggml_element_size(kv.k_l[il])*n_embd_gqa,
|
ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
|
||||||
0),
|
0),
|
||||||
K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
@ -3588,7 +3589,7 @@ static void llm_build_kv_store(
|
|||||||
cb(v_cur_t, "v_cur_t", il);
|
cb(v_cur_t, "v_cur_t", il);
|
||||||
|
|
||||||
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
|
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
|
||||||
(ggml_element_size(kv.k_l[il])*n_embd_gqa)*kv_head);
|
(ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head);
|
||||||
cb(k_cache_view, "k_cache_view", il);
|
cb(k_cache_view, "k_cache_view", il);
|
||||||
|
|
||||||
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
|
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
|
||||||
@ -3747,8 +3748,8 @@ static struct ggml_tensor * llm_build_kqv(
|
|||||||
struct ggml_tensor * k =
|
struct ggml_tensor * k =
|
||||||
ggml_view_3d(ctx, kv.k_l[il],
|
ggml_view_3d(ctx, kv.k_l[il],
|
||||||
n_embd_head, n_kv, n_head_kv,
|
n_embd_head, n_kv, n_head_kv,
|
||||||
ggml_element_size(kv.k_l[il])*n_embd_gqa,
|
ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
|
||||||
ggml_element_size(kv.k_l[il])*n_embd_head,
|
ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
|
||||||
0);
|
0);
|
||||||
cb(k, "k", il);
|
cb(k, "k", il);
|
||||||
|
|
||||||
@ -8734,11 +8735,18 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
ctx->rng = std::mt19937(params.seed);
|
ctx->rng = std::mt19937(params.seed);
|
||||||
ctx->logits_all = params.logits_all;
|
ctx->logits_all = params.logits_all;
|
||||||
|
|
||||||
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
//const ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||||
|
|
||||||
|
// TODO: move as params
|
||||||
|
const ggml_type k_type = GGML_TYPE_Q4_0;
|
||||||
|
const ggml_type v_type = GGML_TYPE_F16;
|
||||||
|
|
||||||
|
GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(k_type) == 0);
|
||||||
|
GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(v_type) == 0);
|
||||||
|
|
||||||
// reserve memory for context buffers
|
// reserve memory for context buffers
|
||||||
if (!hparams.vocab_only) {
|
if (!hparams.vocab_only) {
|
||||||
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
|
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, k_type, v_type, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
|
||||||
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
Loading…
Reference in New Issue
Block a user