Llama-bench: allow benchmarking lora impact

uvos 2025-01-25 11:09:26 +01:00
parent 96f4053934
commit 044d4998ae

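This change threads a --lora option through llama-bench: cmd_params gains a lora vector defaulting to "none", the value becomes one more axis of the benchmark matrix alongside -m, the adapter path is recorded in the per-test output fields, and main() loads the adapter whenever a value other than "none" is given. Passing a comma-separated list, for example --lora none,my-adapter.gguf (the adapter path here is a placeholder), benchmarks the same model both without and with the adapter in a single run.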

@@ -157,6 +157,7 @@ static std::string pair_str(const std::pair<int, int> & p) {
 
 struct cmd_params {
     std::vector<std::string> model;
+    std::vector<std::string> lora;
     std::vector<int> n_prompt;
     std::vector<int> n_gen;
     std::vector<std::pair<int, int>> n_pg;
@@ -189,6 +190,7 @@ struct cmd_params {
 
 static const cmd_params cmd_params_defaults = {
     /* model */ { "models/7B/ggml-model-q4_0.gguf" },
+    /* lora */ { "none" },
     /* n_prompt */ { 512 },
     /* n_gen */ { 128 },
     /* n_pg */ {},
@@ -225,6 +227,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("options:\n");
     printf("  -h, --help\n");
     printf("  -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
+    printf("  --lora <filename> (default: %s)\n", join(cmd_params_defaults.lora, ",").c_str());
     printf("  -p, --n-prompt <n> (default: %s)\n",
            join(cmd_params_defaults.n_prompt, ",").c_str());
     printf("  -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
@@ -341,6 +344,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<std::string>(argv[i], split_delim);
             params.model.insert(params.model.end(), p.begin(), p.end());
+        } else if (arg == "--lora") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], split_delim);
+            params.lora.insert(params.lora.end(), p.begin(), p.end());
         } else if (arg == "-p" || arg == "--n-prompt") {
             if (++i >= argc) {
                 invalid_param = true;
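
Like -m, the --lora handler splits its argument on the list delimiter, so several adapters can be benchmarked in one invocation. A minimal sketch of that splitting, assuming comma as the delimiter (the real string_split<std::string> helper in llama.cpp's common code may differ in detail):

    #include <sstream>
    #include <string>
    #include <vector>

    // Split "none,a.gguf,b.gguf" into {"none", "a.gguf", "b.gguf"}.
    static std::vector<std::string> split_list(const std::string & input, char delim = ',') {
        std::vector<std::string> parts;
        std::istringstream ss(input);
        std::string item;
        while (std::getline(ss, item, delim)) {
            parts.push_back(item);
        }
        return parts;
    }
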
@@ -606,6 +616,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.model.empty()) {
         params.model = cmd_params_defaults.model;
     }
+    if (params.lora.empty()) {
+        params.lora = cmd_params_defaults.lora;
+    }
     if (params.n_prompt.empty()) {
         params.n_prompt = cmd_params_defaults.n_prompt;
     }
@@ -672,6 +685,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
 struct cmd_params_instance {
     std::string model;
+    std::string lora;
     int n_prompt;
     int n_gen;
     int n_batch;
@@ -737,7 +751,7 @@ struct cmd_params_instance {
     }
 
     bool equal_mparams(const cmd_params_instance & other) const {
-        return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
+        return model == other.model && lora == other.lora && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
               split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
               tensor_split == other.tensor_split;
     }
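
Including lora in equal_mparams matters because the benchmark loop reuses a loaded model across consecutive instances whose model-level parameters match; two instances that differ only in their adapter must not share one model. A self-contained sketch of that gating, with stand-in types (the real structs live in llama-bench.cpp):

    #include <cstdio>
    #include <string>
    #include <vector>

    struct instance {
        std::string model;
        std::string lora;
        bool equal_mparams(const instance & o) const {
            // simplified: the real check also compares n_gpu_layers, rpc_servers_str,
            // split_mode, main_gpu, use_mmap and tensor_split
            return model == o.model && lora == o.lora;
        }
    };

    int main() {
        std::vector<instance> insts = { { "7B.gguf", "none" }, { "7B.gguf", "a.gguf" } };
        const instance * prev = nullptr;
        for (const auto & inst : insts) {
            if (prev == nullptr || !inst.equal_mparams(*prev)) {
                // model-level params changed (now including the lora path): reload
                printf("reload %s (lora = %s)\n", inst.model.c_str(), inst.lora.c_str());
            }
            prev = &inst;
        }
        return 0;
    }
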
@@ -764,6 +778,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     // this ordering minimizes the number of times that each model needs to be reloaded
     // clang-format off
     for (const auto & m : params.model)
+    for (const auto & l : params.lora)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & rpc : params.rpc_servers)
     for (const auto & sm : params.split_mode)
@@ -787,6 +802,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             }
             cmd_params_instance instance = {
                 /* .model = */ m,
+                /* .lora = */ l,
                 /* .n_prompt = */ n_prompt,
                 /* .n_gen = */ 0,
                 /* .n_batch = */ nb,
@@ -816,6 +832,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             }
             cmd_params_instance instance = {
                 /* .model = */ m,
+                /* .lora = */ l,
                 /* .n_prompt = */ 0,
                 /* .n_gen = */ n_gen,
                 /* .n_batch = */ nb,
@@ -845,6 +862,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             }
             cmd_params_instance instance = {
                 /* .model = */ m,
+                /* .lora = */ l,
                 /* .n_prompt = */ n_pg.first,
                 /* .n_gen = */ n_pg.second,
                 /* .n_batch = */ nb,
@@ -879,6 +897,7 @@ struct test {
     static const std::string cpu_info;
     static const std::string gpu_info;
     std::string model_filename;
+    std::string lora_filename;
     std::string model_type;
     uint64_t model_size;
     uint64_t model_n_params;
@@ -905,6 +924,7 @@ struct test {
 
     test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
         model_filename = inst.model;
+        lora_filename = inst.lora;
         char buf[128];
         llama_model_desc(lmodel, buf, sizeof(buf));
         model_type = buf;
@@ -966,12 +986,12 @@ struct test {
 
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
-            "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
-            "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
-            "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
-            "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap",
-            "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns",
-            "avg_ts", "stddev_ts",
+            "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
+            "lora_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch",
+            "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v",
+            "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split",
+            "use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns",
+            "stddev_ns", "avg_ts", "stddev_ts",
         };
         return fields;
     }
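
The string at position i of get_fields() names the value at position i of the list in the next hunk, which is why "lora_filename" is inserted right after "model_filename" in both places: the tabular printers would otherwise emit misaligned columns. A toy illustration of that parallel-array invariant (stand-in data, not the real lists):

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        std::vector<std::string> fields = { "model_filename", "lora_filename", "model_type" };
        std::vector<std::string> values = { "7B.gguf",        "none",          "llama 7B Q4_0" };
        for (size_t i = 0; i < fields.size(); i++) {
            printf("%s = %s\n", fields[i].c_str(), values[i].c_str());  // column i lines up
        }
        return 0;
    }
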
@@ -1017,6 +1037,7 @@ struct test {
             gpu_info,
             get_backend(),
             model_filename,
+            lora_filename,
             model_type,
             std::to_string(model_size),
             std::to_string(model_n_params),
@@ -1259,6 +1280,9 @@ struct markdown_printer : public printer {
     void print_header(const cmd_params & params) override {
         // select fields to print
         fields.emplace_back("model");
+        if (params.lora.size() > 1 || (!params.lora.empty() && params.lora[0] != "none")) {
+            fields.emplace_back("lora");
+        }
         fields.emplace_back("size");
         fields.emplace_back("params");
         fields.emplace_back("backend");
@@ -1337,6 +1361,8 @@ struct markdown_printer : public printer {
         char buf[128];
         if (field == "model") {
             value = t.model_type;
+        } else if (field == "lora") {
+            value = t.lora_filename.empty() || t.lora_filename == "none" ? "N" : "Y";
         } else if (field == "size") {
             if (t.model_size < 1024 * 1024 * 1024) {
                 snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
@@ -1561,6 +1587,9 @@ int main(int argc, char ** argv) {
             }
 
             lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
+            if (!inst.lora.empty() && inst.lora != "none") {
+                llama_adapter_lora_init(lmodel, inst.lora.c_str());
+            }
             if (lmodel == NULL) {
                 fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
                 return 1;
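
For context on the llama_adapter_lora_init call in the last hunk: assuming the llama.cpp C API of this period, the function returns a llama_adapter_lora pointer (NULL on failure), and an adapter influences inference once attached to a context. A minimal, self-contained sketch under those assumptions, with placeholder file paths and the attach step included for illustration:

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init();

        llama_model * model = llama_model_load_from_file(
            "models/7B/ggml-model-q4_0.gguf", llama_model_default_params());
        if (model == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // returns NULL if the adapter file cannot be loaded
        llama_adapter_lora * adapter = llama_adapter_lora_init(model, "my-adapter.gguf");
        if (adapter == NULL) {
            fprintf(stderr, "failed to load LoRA adapter\n");
            return 1;
        }

        llama_context * ctx = llama_init_from_model(model, llama_context_default_params());
        if (ctx != NULL) {
            // attach the adapter to this context at full strength
            llama_set_adapter_lora(ctx, adapter, 1.0f);
            // ... run prompt processing / generation here ...
            llama_free(ctx);
        }

        // adapters loaded on a model are released along with it
        llama_model_free(model);
        llama_backend_free();
        return 0;
    }
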