common : add -dkvc arg for enabling kv cache dumps

2024-12-28 15:18:26 +01:00 · 2023-11-23 18:47:56 +02:00 · 2023-11-23 18:47:56 +02:00 · f8e9f11428
commit f8e9f11428
parent 5df7d06c42
4 changed files with 14 additions and 5 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -496,6 +496,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            params.chatml = true;
        } else if (arg == "--infill") {
            params.infill = true;
+        } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
+            params.dump_kv_cache = true;
        } else if (arg == "--multiline-input") {
            params.multiline_input = true;
        } else if (arg == "--simple-io") {
@@ -836,6 +838,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif // GGML_USE_CUBLAS
 #endif
    printf("  --verbose-prompt      print prompt before generation\n");
+    printf("  -dkvc, --dump-kv-cache\n");
+    printf("                        verbose print of the KV cache\n");
    printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
    printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
    printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
--- a/common/common.h
+++ b/common/common.h
@@ -122,6 +122,7 @@ struct gpt_params {
    bool numa              = false; // attempt optimizations that help on some NUMA systems
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool infill            = false; // use infill mode
+    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes

    // multimodal models (see examples/llava)
    std::string mmproj = ""; // path to multimodal projector
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -113,6 +113,8 @@ int main(int argc, char ** argv) {
    // insert new requests as soon as the previous one is done
    const bool cont_batching = params.cont_batching;

+    const bool dump_kv_cache = params.dump_kv_cache;
+
 #ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("parallel", "log"));
    LOG_TEE("Log start\n");
@@ -203,8 +205,10 @@ int main(int argc, char ** argv) {
    LOG_TEE("Processing requests ...\n\n");

    while (true) {
+        if (dump_kv_cache) {
            llama_kv_cache_view_update(ctx, &kvc_view);
            dump_kv_cache_view_seqs(kvc_view, 40);
+        }

        llama_batch_clear(batch);

--- a/llama.h
+++ b/llama.h
@@ -400,13 +400,13 @@ extern "C" {
        llama_seq_id * cells_sequences;
    };

-    // Create an empty KV cache view.
+    // Create an empty KV cache view. (use only for debugging purposes)
    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);

-    // Free a KV cache view.
+    // Free a KV cache view. (use only for debugging purposes)
    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);

-    // Update the KV cache view structure with the current state of the KV cache.
+    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);

    // Returns the number of tokens in the KV cache (slow, use only for debug)