From 6da9021ab4a81d6e492d4ae99cfcf16886121d99 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 1 Nov 2024 09:14:54 +0200
Subject: [PATCH] examples : add idle tool for investigating GPU idle overhead

---
 Makefile                     |   1 +
 examples/CMakeLists.txt      |   1 +
 examples/idle/CMakeLists.txt |   5 ++
 examples/idle/README.md      |   3 +
 examples/idle/idle.cpp       | 112 +++++++++++++++++++++++++++++++++++
 5 files changed, 122 insertions(+)
 create mode 100644 examples/idle/CMakeLists.txt
 create mode 100644 examples/idle/README.md
 create mode 100644 examples/idle/idle.cpp

diff --git a/Makefile b/Makefile
index 295522ba3..ccf595bb5 100644
--- a/Makefile
+++ b/Makefile
@@ -18,6 +18,7 @@ BUILD_TARGETS = \
 	llama-gguf-hash \
 	llama-gguf-split \
 	llama-gritlm \
+	llama-idle \
 	llama-imatrix \
 	llama-infill \
 	llama-llava-cli \
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 66cfab2c3..abfc51fb1 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -30,6 +30,7 @@ else()
     add_subdirectory(gguf-split)
     add_subdirectory(gguf)
     add_subdirectory(gritlm)
+    add_subdirectory(idle)
     add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
diff --git a/examples/idle/CMakeLists.txt b/examples/idle/CMakeLists.txt
new file mode 100644
index 000000000..d5018fec4
--- /dev/null
+++ b/examples/idle/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-idle)
+add_executable(${TARGET} idle.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/idle/README.md b/examples/idle/README.md
new file mode 100644
index 000000000..0aa3625f2
--- /dev/null
+++ b/examples/idle/README.md
@@ -0,0 +1,3 @@
+# llama.cpp/example/idle
+
+
diff --git a/examples/idle/idle.cpp b/examples/idle/idle.cpp
new file mode 100644
index 000000000..8199cf7b1
--- /dev/null
+++ b/examples/idle/idle.cpp
@@ -0,0 +1,112 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <string>
+#include <thread>
+#include <vector>
+
+static void print_usage(int /*argc*/, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n    %s -m model.gguf [-ngl n_gpu_layers]\n", argv[0]);
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+        return 1;
+    }
+
+    common_init();
+
+    // init LLM
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // initialize the model
+
+    llama_model_params model_params = common_model_params_to_llama(params);
+
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+
+    if (model == NULL) {
+        LOG_ERR("%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    // we need just a dummy token to evaluate
+    std::vector<llama_token> prompt_tokens(1, llama_vocab_bos(vocab));
+
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx   = 512;
+    ctx_params.n_batch = 512;
+    ctx_params.no_perf = false;
+
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
+    if (ctx == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+
+    const int n_iters = 10;
+
+    // warm-up
+    llama_decode(ctx, batch);
+    llama_kv_cache_clear (ctx);
+    llama_kv_cache_update(ctx);
+    llama_synchronize    (ctx);
+
+    for (int64_t t_pause_ms = 200; t_pause_ms <= 1800; t_pause_ms += 200) {
+        double t_sum_us  = 0.0;
+        double t_sum2_us = 0.0;
+
+        for (int i = 0; i < n_iters; i++) {
+            // this pause is important - it simulates "idle GPU"
+            std::this_thread::sleep_for(std::chrono::milliseconds(t_pause_ms));
+
+            const int64_t t_start_us = llama_time_us();
+
+            // this should take constant time
+            llama_decode(ctx, batch);
+            llama_synchronize(ctx);
+
+            const int64_t t_end_us = llama_time_us();
+
+            const double t_cur_us = t_end_us - t_start_us;
+
+#if 0
+            // print individual decode times
+            printf(" - decode time: %8.2f ms\n", t_cur_us / 1000);
+#endif
+
+            t_sum_us  += t_cur_us;
+            t_sum2_us += t_cur_us * t_cur_us;
+
+            llama_kv_cache_clear (ctx);
+            llama_kv_cache_update(ctx);
+            llama_synchronize    (ctx); // just in case
+        }
+
+        const double t_avg_us = t_sum_us / n_iters;
+        const double t_dev_us = sqrt((t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1));
+
+        printf("iters: %4d, pause: %5d ms, avg decode time: %8.2f +/- %4.2f ms\n", n_iters, (int) t_pause_ms, t_avg_us / 1000, t_dev_us / 1000);
+        fflush(stdout);
+    }
+
+    llama_free(ctx);
+    llama_model_free(model);
+
+    return 0;
+}
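
Note on the reported statistics: the avg/dev values printed for each pause length are computed from running sums of the per-iteration decode times, using the sample standard deviation s = sqrt((sum(x^2) - n*avg^2) / (n - 1)). Below is a minimal standalone sketch of that same computation, using made-up decode times rather than real measurements from the tool:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // made-up decode times in microseconds, standing in for the measured t_cur_us values
        const std::vector<double> t_us = { 9100.0, 9250.0, 8990.0, 9120.0, 9080.0 };

        double t_sum_us  = 0.0;
        double t_sum2_us = 0.0;
        for (const double t : t_us) {
            t_sum_us  += t;
            t_sum2_us += t * t;
        }

        const int n = (int) t_us.size();

        // same running-sum form of the sample standard deviation as in idle.cpp
        const double t_avg_us = t_sum_us / n;
        const double t_dev_us = sqrt((t_sum2_us / (n - 1)) - (t_avg_us * t_avg_us * n) / (n - 1));

        printf("avg decode time: %8.2f +/- %4.2f ms\n", t_avg_us / 1000, t_dev_us / 1000);

        return 0;
    }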