diff --git a/common/common.cpp b/common/common.cpp
index 1dcc235ea..43c374d5c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -498,6 +498,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             params.infill = true;
         } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
             params.dump_kv_cache = true;
+        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
+            params.no_kv_offload = true;
         } else if (arg == "--multiline-input") {
             params.multiline_input = true;
         } else if (arg == "--simple-io") {
@@ -840,6 +842,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --verbose-prompt      print prompt before generation\n");
     printf("  -dkvc, --dump-kv-cache\n");
     printf("                        verbose print of the KV cache\n");
+    printf("  -nkvo, --no-kv-offload\n");
+    printf("                        disable KV offload\n");
     printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
     printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
     printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
@@ -924,6 +928,7 @@ struct llama_context_params llama_context_params_from_gpt_param
     cparams.yarn_beta_fast = params.yarn_beta_fast;
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.yarn_orig_ctx  = params.yarn_orig_ctx;
+    cparams.offload_kqv    = !params.no_kv_offload;
 
     return cparams;
 }
diff --git a/common/common.h b/common/common.h
index 2f6fe48ab..2664c8fc1 100644
--- a/common/common.h
+++ b/common/common.h
@@ -123,6 +123,7 @@ struct gpt_params {
     bool verbose_prompt   = false; // print prompt tokens before generation
     bool infill           = false; // use infill mode
     bool dump_kv_cache    = false; // dump the KV cache contents for debugging purposes
+    bool no_kv_offload    = false; // disable KV offloading
 
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
diff --git a/llama.cpp b/llama.cpp
index 8a2946fe7..357f19bb0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1245,8 +1245,7 @@ struct llama_cparams {
     float yarn_beta_slow;
 
     bool mul_mat_q;
-    bool offload_k;
-    bool offload_v;
+    bool offload_kqv;
 };
 
@@ -1526,8 +1525,7 @@ static bool llama_kv_cache_init(
                          ggml_type   wtype,
                           uint32_t   n_ctx,
                                int   n_gpu_layers,
-                              bool   offload_k,
-                              bool   offload_v) {
+                              bool   offload) {
     const uint32_t n_embd  = hparams.n_embd_gqa();
     const uint32_t n_layer = hparams.n_layer;
@@ -1574,11 +1572,9 @@ static bool llama_kv_cache_init(
         cache.v_l.push_back(v);
 #ifdef GGML_USE_CUBLAS
         if (i >= i_gpu_start) {
-            if (offload_k) {
+            if (offload) {
                 ggml_cuda_assign_buffers_no_scratch(k);
                 vram_kv_cache += ggml_nbytes(k);
-            }
-            if (offload_v) {
                 ggml_cuda_assign_buffers_no_scratch(v);
                 vram_kv_cache += ggml_nbytes(v);
             }
@@ -5101,6 +5097,7 @@ enum llm_offload_func_e {
     OFFLOAD_FUNC_NOP,
     OFFLOAD_FUNC,
     OFFLOAD_FUNC_FRC, // force offload
+    OFFLOAD_FUNC_KQV,
     OFFLOAD_FUNC_NR,
     OFFLOAD_FUNC_EMB,
     OFFLOAD_FUNC_OUT,
@@ -5204,38 +5201,38 @@ static const std::unordered_map k_offload_map
     { "attn_norm",          OFFLOAD_FUNC },
     { "attn_norm_2",        OFFLOAD_FUNC },
 
-    { "wqkv",               OFFLOAD_FUNC },
-    { "bqkv",               OFFLOAD_FUNC },
-    { "wqkv_clamped",       OFFLOAD_FUNC },
+    { "wqkv",               OFFLOAD_FUNC_KQV },
+    { "bqkv",               OFFLOAD_FUNC_KQV },
+    { "wqkv_clamped",       OFFLOAD_FUNC_KQV },
 
-    { "tmpk",               OFFLOAD_FUNC },
-    { "tmpq",               OFFLOAD_FUNC },
-    { "tmpv",               OFFLOAD_FUNC },
-    { "Kcur",               OFFLOAD_FUNC },
-    { "Qcur",               OFFLOAD_FUNC },
-    { "Vcur",               OFFLOAD_FUNC },
+    { "tmpk",               OFFLOAD_FUNC_KQV },
+    { "tmpq",               OFFLOAD_FUNC_KQV },
+    { "tmpv",               OFFLOAD_FUNC_KQV },
+    { "Kcur",               OFFLOAD_FUNC_KQV },
+    { "Qcur",               OFFLOAD_FUNC_KQV },
+    { "Vcur",               OFFLOAD_FUNC_KQV },
 
-    { "krot",               OFFLOAD_FUNC },
-    { "qrot",               OFFLOAD_FUNC },
-    { "kpass",              OFFLOAD_FUNC },
-    { "qpass",              OFFLOAD_FUNC },
-    { "krotated",           OFFLOAD_FUNC },
-    { "qrotated",           OFFLOAD_FUNC },
+    { "krot",               OFFLOAD_FUNC_KQV },
+    { "qrot",               OFFLOAD_FUNC_KQV },
+    { "kpass",              OFFLOAD_FUNC_KQV },
+    { "qpass",              OFFLOAD_FUNC_KQV },
+    { "krotated",           OFFLOAD_FUNC_KQV },
+    { "qrotated",           OFFLOAD_FUNC_KQV },
 
-    { "q",                  OFFLOAD_FUNC },
-    { "k",                  OFFLOAD_FUNC },
-    { "kq",                 OFFLOAD_FUNC },
-    { "kq_scaled",          OFFLOAD_FUNC },
-    { "kq_scaled_alibi",    OFFLOAD_FUNC },
-    { "kq_masked",          OFFLOAD_FUNC },
-    { "kq_soft_max",        OFFLOAD_FUNC },
-    { "kq_soft_max_ext",    OFFLOAD_FUNC },
-    { "v",                  OFFLOAD_FUNC },
-    { "kqv",                OFFLOAD_FUNC },
-    { "kqv_merged",         OFFLOAD_FUNC },
-    { "kqv_merged_cont",    OFFLOAD_FUNC },
-    { "kqv_wo",             OFFLOAD_FUNC },
-    { "kqv_out",            OFFLOAD_FUNC },
+    { "q",                  OFFLOAD_FUNC_KQV },
+    { "k",                  OFFLOAD_FUNC_KQV },
+    { "kq",                 OFFLOAD_FUNC_KQV },
+    { "kq_scaled",          OFFLOAD_FUNC_KQV },
+    { "kq_scaled_alibi",    OFFLOAD_FUNC_KQV },
+    { "kq_masked",          OFFLOAD_FUNC_KQV },
+    { "kq_soft_max",        OFFLOAD_FUNC_KQV },
+    { "kq_soft_max_ext",    OFFLOAD_FUNC_KQV },
+    { "v",                  OFFLOAD_FUNC_KQV },
+    { "kqv",                OFFLOAD_FUNC_KQV },
+    { "kqv_merged",         OFFLOAD_FUNC_KQV },
+    { "kqv_merged_cont",    OFFLOAD_FUNC_KQV },
+    { "kqv_wo",             OFFLOAD_FUNC_KQV },
+    { "kqv_out",            OFFLOAD_FUNC_KQV },
 
     { "ffn_inp",            OFFLOAD_FUNC },
     { "ffn_norm",           OFFLOAD_FUNC },
@@ -5429,11 +5426,13 @@ static struct ggml_cgraph * llama_build_graph(
 #ifdef GGML_USE_CUBLAS
         { OFFLOAD_FUNC,     "GPU (CUDA)" },
         { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
+        { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
         { OFFLOAD_FUNC_NR,  "GPU (CUDA) NR" },
         { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
 #else
         { OFFLOAD_FUNC,     "CPU" },
         { OFFLOAD_FUNC_FRC, "CPU" },
+        { OFFLOAD_FUNC_KQV, "CPU" },
         { OFFLOAD_FUNC_NR,  "CPU" },
         { OFFLOAD_FUNC_EMB, "CPU" },
 #endif // GGML_USE_CUBLAS
@@ -5458,7 +5457,6 @@ static struct ggml_cgraph * llama_build_graph(
         switch (func_e) {
             case OFFLOAD_FUNC_NOP:
             case OFFLOAD_FUNC_OUT:
-            case OFFLOAD_FUNC_FRC:
                 break;
             case OFFLOAD_FUNC:
                 if (n_gpu_layers < n_layer) {
@@ -5467,6 +5465,21 @@ static struct ggml_cgraph * llama_build_graph(
                     }
                 }
                 break;
+            case OFFLOAD_FUNC_FRC:
+                if (!lctx.cparams.offload_kqv) {
+                    func_e = OFFLOAD_FUNC_NOP;
+                } break;
+            case OFFLOAD_FUNC_KQV:
+                if (!lctx.cparams.offload_kqv) {
+                    func_e = OFFLOAD_FUNC_NOP;
+                } else {
+                    if (n_gpu_layers < n_layer) {
+                        if (il < i_gpu_start) {
+                            func_e = OFFLOAD_FUNC_NOP;
+                        }
+                    }
+                }
+                break;
             case OFFLOAD_FUNC_NR:
                 if (n_gpu_layers <= n_layer + 0) {
                     func_e = OFFLOAD_FUNC_NOP;
@@ -5493,6 +5506,7 @@ static struct ggml_cgraph * llama_build_graph(
             case OFFLOAD_FUNC_NOP:
             case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
             case OFFLOAD_FUNC:
+            case OFFLOAD_FUNC_KQV:
            case OFFLOAD_FUNC_FRC:
             case OFFLOAD_FUNC_NR:
             case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
@@ -8567,8 +8581,7 @@ struct llama_context_params llama_context_default_params() {
         /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
         /*.embedding                   =*/ false,
-        /*.offload_k                   =*/ true,
-        /*.offload_q                   =*/ true,
+        /*.offload_kqv                 =*/ true,
     };
 
     return result;
@@ -8685,8 +8698,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.yarn_beta_fast = params.yarn_beta_fast;
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.mul_mat_q      = params.mul_mat_q;
-    cparams.offload_k      = params.offload_k;
-    cparams.offload_v      = params.offload_v;
+    cparams.offload_kqv    = params.offload_kqv;
 
     cparams.n_ctx          = params.n_ctx           == 0   ? hparams.n_ctx_train           : params.n_ctx;
     cparams.rope_freq_base = params.rope_freq_base  == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -8724,7 +8736,7 @@ struct llama_context * llama_new_context_with_model(
 
     // reserve memory for context buffers
     if (!hparams.vocab_only) {
-        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers, cparams.offload_k, cparams.offload_v)) {
+        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
diff --git a/llama.h b/llama.h
index 3e2ad0560..634969b34 100644
--- a/llama.h
+++ b/llama.h
@@ -192,12 +192,11 @@ extern "C" {
         uint32_t yarn_orig_ctx; // YaRN original context size
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
-        bool embedding;  // embedding mode only
-        bool offload_k;
-        bool offload_v;
+        bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+        bool f16_kv;      // use fp16 for KV cache, fp32 otherwise
+        bool logits_all;  // the llama_eval() call computes all logits, not just the last one
+        bool embedding;   // embedding mode only
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
     };
 
     // model quantization parameters
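
For context, a minimal sketch of how a caller might opt out of KV offloading through the new offload_kqv field. The model path and layer count below are placeholders, and the surrounding llama.h calls are assumed to be the existing public API, unchanged by this patch:

// sketch: build a context that keeps the KQV ops and the KV cache on the CPU
#include "llama.h"

int main() {
    llama_backend_init(/*numa =*/ false);

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99; // placeholder: offload the repeating layers to the GPU

    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path

    llama_context_params cparams = llama_context_default_params();
    cparams.offload_kqv = false; // same effect as the new -nkvo / --no-kv-offload flag

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... run inference ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}

On the command line, the equivalent for the examples that use gpt_params would be something like ./main -m model.gguf -ngl 99 -nkvo.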