batched-bench : add --output-format jsonl option (#9293)

`--output-format` is modeled after `llama-bench`'s options
commit 815b1fb20a (parent 409dc4f8bb)
diff --git a/common/common.cpp b/common/common.cpp
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1678,6 +1678,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params) {
         else { invalid_param = true; }
         return true;
     }
+    if (arg == "--output-format") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
+        else if (value == "md") { params.batched_bench_output_jsonl = false; }
+        else { invalid_param = true; }
+        return true;
+    }
     if (arg == "--no-warmup") {
         params.warmup = false;
         return true;
@@ -2068,6 +2076,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
     options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
 
+    options.push_back({ "batched-bench" });
+    options.push_back({ "batched-bench", " --output-format {md,jsonl}", "output format for batched-bench results (default: md)" });
+
     printf("usage: %s [options]\n", argv[0]);
 
     for (const auto & o : options) {
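Note: the new `--output-format` branch follows the same shape as the other string-valued options in `gpt_params_find_arg`: read the next `argv` entry (via `CHECK_ARG`), map a small set of accepted values onto a `gpt_params` field, and flag everything else as an invalid parameter. Below is a stand-alone sketch of that pattern; `OutputFormat` and `parse_output_format` are hypothetical names, while the actual patch simply stores a single bool (`batched_bench_output_jsonl`).

```cpp
// Stand-alone sketch of the md/jsonl value-mapping pattern (hypothetical
// names; the real patch writes directly into gpt_params).
#include <cstdio>
#include <optional>
#include <string>

enum class OutputFormat { MARKDOWN, JSONL };

static std::optional<OutputFormat> parse_output_format(const std::string & value) {
    if (value == "md")    { return OutputFormat::MARKDOWN; }
    if (value == "jsonl") { return OutputFormat::JSONL;    }
    return std::nullopt; // caller reports an invalid parameter
}

int main(int argc, char ** argv) {
    const std::string value = argc > 1 ? argv[1] : "md";
    const auto fmt = parse_output_format(value);
    if (!fmt) {
        std::fprintf(stderr, "invalid --output-format value: '%s'\n", value.c_str());
        return 1;
    }
    std::printf("output format: %s\n", *fmt == OutputFormat::JSONL ? "jsonl" : "md");
    return 0;
}
```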
diff --git a/common/common.h b/common/common.h
--- a/common/common.h
+++ b/common/common.h
@@ -275,6 +275,9 @@ struct gpt_params {
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
 
     std::string lora_outfile = "ggml-lora-merged-f16.gguf";
+
+    // batched-bench params
+    bool batched_bench_output_jsonl = false;
 };
 
 void gpt_params_parse_from_env(gpt_params & params);
diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -49,3 +49,12 @@ There are 2 modes of operation:
 | 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
 | 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
 | 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
+
+### JSONL output
+
+Pass `--output-format jsonl` to output JSONL instead of Markdown, à la `llama-bench`.
+
+```json lines
+{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094}
+{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854}
+```
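Note: because each record in the JSONL output is a flat JSON object on its own line, the results can be post-processed with a simple streaming filter (or a `jq` one-liner). Below is a minimal consumer sketch, not part of llama.cpp, that reads the JSONL from stdin and prints the prompt-processing speed of every record; parsing without a JSON library is an illustrative shortcut only.

```cpp
// Minimal JSONL consumer sketch (not part of llama.cpp): reads batched-bench
// JSONL records from stdin and prints the "speed_pp" field of each one.
// It relies on the flat one-object-per-line format; a real tool would use a
// proper JSON parser.
#include <iostream>
#include <string>

int main() {
    const std::string key = "\"speed_pp\": ";
    std::string line;
    while (std::getline(std::cin, line)) {
        const size_t pos = line.find(key);
        if (pos == std::string::npos) {
            continue; // not a result record
        }
        // std::stod stops at the first non-numeric character (the ',').
        const double speed_pp = std::stod(line.substr(pos + key.size()));
        std::cout << "prompt processing: " << speed_pp << " t/s\n";
    }
    return 0;
}
```

Something like `llama-batched-bench -m model.gguf --output-format jsonl | ./jsonl_speed` (model path and helper name are placeholders) would then print one speed per benchmark configuration.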
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -122,12 +122,13 @@ int main(int argc, char ** argv) {
         }
     }
 
+    if (!params.batched_bench_output_jsonl) {
     LOG_TEE("\n");
     LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
     LOG_TEE("\n");
 
     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
     LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+    }
 
     for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
         for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
@@ -195,10 +196,19 @@ int main(int argc, char ** argv) {
                 const float speed_tg = pl*tg / t_tg;
                 const float speed = n_kv / t;
 
+                if(params.batched_bench_output_jsonl) {
+                    LOG_TEE(
+                        "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
+                        "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
+                        n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
+                        pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
+                    );
+                } else {
                 LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+                }
             }
         }
     }
 
     llama_print_timings(ctx);
 