From 6da9021ab4a81d6e492d4ae99cfcf16886121d99 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 1 Nov 2024 09:14:54 +0200
Subject: [PATCH] examples : add idle tool for investigating GPU idle overhead

---
 Makefile                     |   1 +
 examples/CMakeLists.txt      |   1 +
 examples/idle/CMakeLists.txt |   5 ++
 examples/idle/README.md      |   3 +
 examples/idle/idle.cpp       | 112 +++++++++++++++++++++++++++++++++++
 5 files changed, 122 insertions(+)
 create mode 100644 examples/idle/CMakeLists.txt
 create mode 100644 examples/idle/README.md
 create mode 100644 examples/idle/idle.cpp

diff --git a/Makefile b/Makefile
index 295522ba3..ccf595bb5 100644
--- a/Makefile
+++ b/Makefile
@@ -18,6 +18,7 @@ BUILD_TARGETS = \
 	llama-gguf-hash \
 	llama-gguf-split \
 	llama-gritlm \
+	llama-idle \
 	llama-imatrix \
 	llama-infill \
 	llama-llava-cli \
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 66cfab2c3..abfc51fb1 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -30,6 +30,7 @@ else()
     add_subdirectory(gguf-split)
     add_subdirectory(gguf)
     add_subdirectory(gritlm)
+    add_subdirectory(idle)
     add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
diff --git a/examples/idle/CMakeLists.txt b/examples/idle/CMakeLists.txt
new file mode 100644
index 000000000..d5018fec4
--- /dev/null
+++ b/examples/idle/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-idle)
+add_executable(${TARGET} idle.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/idle/README.md b/examples/idle/README.md
new file mode 100644
index 000000000..0aa3625f2
--- /dev/null
+++ b/examples/idle/README.md
@@ -0,0 +1,3 @@
+# llama.cpp/example/idle
+
+
diff --git a/examples/idle/idle.cpp b/examples/idle/idle.cpp
new file mode 100644
index 000000000..8199cf7b1
--- /dev/null
+++ b/examples/idle/idle.cpp
@@ -0,0 +1,112 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <string>
+#include <thread>
+#include <vector>
+
+static void print_usage(int /*argc*/, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n    %s -m model.gguf [-ngl n_gpu_layers]\n", argv[0]);
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+        return 1;
+    }
+
+    common_init();
+
+    // init LLM
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // initialize the model
+
+    llama_model_params model_params = common_model_params_to_llama(params);
+
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+
+    if (model == NULL) {
+        LOG_ERR("%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    // we need just a dummy token to evaluate
+    std::vector<llama_token> prompt_tokens(1, llama_vocab_bos(vocab));
+
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx   = 512;
+    ctx_params.n_batch = 512;
+    ctx_params.no_perf = false;
+
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
+    if (ctx == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+
+    const int n_iters = 10;
+
+    // warm-up
+    llama_decode(ctx, batch);
+    llama_kv_cache_clear (ctx);
+    llama_kv_cache_update(ctx);
+    llama_synchronize    (ctx);
+
+    for (int64_t t_pause_ms = 200; t_pause_ms <= 1800; t_pause_ms += 200) {
+        double t_sum_us  = 0.0;
+        double t_sum2_us = 0.0;
+
+        for (int i = 0; i < n_iters; i++) {
+            // this pause is important - it simulates "idle GPU"
+            std::this_thread::sleep_for(std::chrono::milliseconds(t_pause_ms));
+
+            const int64_t t_start_us = llama_time_us();
+
+            // this should take constant time
+            llama_decode(ctx, batch);
+            llama_synchronize(ctx);
+
+            const int64_t t_end_us = llama_time_us();
+
+            const double t_cur_us = t_end_us - t_start_us;
+
+#if 0
+            // print individual decode times
+            printf(" - decode time: %8.2f ms\n", t_cur_us / 1000);
+#endif
+
+            t_sum_us  += t_cur_us;
+            t_sum2_us += t_cur_us * t_cur_us;
+
+            llama_kv_cache_clear (ctx);
+            llama_kv_cache_update(ctx);
+            llama_synchronize    (ctx); // just in case
+        }
+
+        const double t_avg_us = t_sum_us / n_iters;
+        const double t_dev_us = sqrt((t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1));
+
+        printf("iters: %4d, pause: %5d ms, avg decode time: %8.2f +/- %4.2f ms\n", n_iters, (int) t_pause_ms, t_avg_us / 1000, t_dev_us / 1000);
+        fflush(stdout);
+    }
+
+    llama_free(ctx);
+    llama_model_free(model);
+
+    return 0;
+}
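
Note on the reported statistics: the avg/dev values printed for each pause length are computed from running sums of the per-iteration decode times, using the sample standard deviation s = sqrt((sum(x^2) - n*avg^2) / (n - 1)). Below is a minimal standalone sketch of that same computation, using made-up decode times rather than real measurements from the tool:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // made-up decode times in microseconds, standing in for the measured t_cur_us values
        const std::vector<double> t_us = { 9100.0, 9250.0, 8990.0, 9120.0, 9080.0 };

        double t_sum_us  = 0.0;
        double t_sum2_us = 0.0;
        for (const double t : t_us) {
            t_sum_us  += t;
            t_sum2_us += t * t;
        }

        const int n = (int) t_us.size();

        // same running-sum form of the sample standard deviation as in idle.cpp
        const double t_avg_us = t_sum_us / n;
        const double t_dev_us = sqrt((t_sum2_us / (n - 1)) - (t_avg_us * t_avg_us * n) / (n - 1));

        printf("avg decode time: %8.2f +/- %4.2f ms\n", t_avg_us / 1000, t_dev_us / 1000);

        return 0;
    }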