diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 4ac19ca86..3932e5a92 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -157,6 +157,7 @@ static std::string pair_str(const std::pair<int, int> & p) {
 
 struct cmd_params {
     std::vector<std::string>         model;
+    std::vector<std::string>         lora;
     std::vector<int>                 n_prompt;
     std::vector<int>                 n_gen;
     std::vector<std::pair<int, int>> n_pg;
@@ -189,6 +190,7 @@ struct cmd_params {
 
 static const cmd_params cmd_params_defaults = {
     /* model        */ { "models/7B/ggml-model-q4_0.gguf" },
+    /* lora         */ { "none" },
     /* n_prompt     */ { 512 },
     /* n_gen        */ { 128 },
     /* n_pg         */ {},
@@ -225,6 +227,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("options:\n");
     printf("  -h, --help\n");
     printf("  -m, --model <filename>                (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
+    printf("  --lora <filename>                     (default: %s)\n", join(cmd_params_defaults.lora, ",").c_str());
     printf("  -p, --n-prompt <n>                    (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     printf("  -n, --n-gen <n>                       (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
@@ -341,6 +344,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<std::string>(argv[i], split_delim);
             params.model.insert(params.model.end(), p.begin(), p.end());
+        } else if (arg == "--lora") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], split_delim);
+            params.lora.insert(params.lora.end(), p.begin(), p.end());
         } else if (arg == "-p" || arg == "--n-prompt") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -606,6 +616,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.model.empty()) {
         params.model = cmd_params_defaults.model;
     }
+    if (params.lora.empty()) {
+        params.lora = cmd_params_defaults.lora;
+    }
     if (params.n_prompt.empty()) {
         params.n_prompt = cmd_params_defaults.n_prompt;
     }
@@ -672,6 +685,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
 struct cmd_params_instance {
     std::string model;
+    std::string lora;
     int         n_prompt;
     int         n_gen;
     int         n_batch;
@@ -737,7 +751,7 @@ struct cmd_params_instance {
     }
 
     bool equal_mparams(const cmd_params_instance & other) const {
-        return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
+        return model == other.model && lora == other.lora && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
               split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
               tensor_split == other.tensor_split;
     }
@@ -764,6 +778,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     // this ordering minimizes the number of times that each model needs to be reloaded
     // clang-format off
     for (const auto & m : params.model)
+    for (const auto & l : params.lora)
    for (const auto & nl : params.n_gpu_layers)
     for (const auto & rpc : params.rpc_servers)
     for (const auto & sm : params.split_mode)
@@ -787,6 +802,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             }
             cmd_params_instance instance = {
                 /* .model        = */ m,
+                /* .lora         = */ l,
                 /* .n_prompt     = */ n_prompt,
                 /* .n_gen        = */ 0,
                 /* .n_batch      = */ nb,
@@ -816,6 +832,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             }
             cmd_params_instance instance = {
                 /* .model        = */ m,
+                /* .lora         = */ l,
                 /* .n_prompt     = */ 0,
                 /* .n_gen        = */ n_gen,
                 /* .n_batch      = */ nb,
@@ -845,6 +862,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             }
             cmd_params_instance instance = {
                 /* .model        = */ m,
+                /* .lora         = */ l,
                 /* .n_prompt     = */ n_pg.first,
                 /* .n_gen        = */ n_pg.second,
                 /* .n_batch      = */ nb,
@@ -879,6 +897,7 @@ struct test {
     static const std::string cpu_info;
     static const std::string gpu_info;
     std::string model_filename;
+    std::string lora_filename;
     std::string model_type;
     uint64_t model_size;
     uint64_t model_n_params;
@@ -905,6 +924,7 @@ struct test {
 
     test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
         model_filename = inst.model;
+        lora_filename = inst.lora;
         char buf[128];
         llama_model_desc(lmodel, buf, sizeof(buf));
         model_type = buf;
@@ -966,12 +986,12 @@ struct test {
 
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
-            "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
-            "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
-            "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
-            "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap",
-            "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns",
-            "avg_ts", "stddev_ts",
+            "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
+            "lora_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch",
+            "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v",
+            "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split",
+            "use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns",
+            "stddev_ns", "avg_ts", "stddev_ts",
         };
         return fields;
     }
@@ -1017,6 +1037,7 @@ struct test {
             gpu_info,
             get_backend(),
             model_filename,
+            lora_filename,
             model_type,
             std::to_string(model_size),
             std::to_string(model_n_params),
@@ -1259,6 +1280,9 @@ struct markdown_printer : public printer {
     void print_header(const cmd_params & params) override {
         // select fields to print
         fields.emplace_back("model");
+        if (params.lora.size() > 1 || (!params.lora.empty() && params.lora[0] != "none")) {
+            fields.emplace_back("lora");
+        }
         fields.emplace_back("size");
         fields.emplace_back("params");
         fields.emplace_back("backend");
@@ -1337,6 +1361,8 @@ struct markdown_printer : public printer {
             char buf[128];
             if (field == "model") {
                 value = t.model_type;
+            } else if (field == "lora") {
+                value = t.lora_filename.empty() || t.lora_filename == "none" ? "N" : "Y";
             } else if (field == "size") {
                 if (t.model_size < 1024 * 1024 * 1024) {
                     snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
@@ -1561,6 +1587,9 @@ int main(int argc, char ** argv) {
             }
 
             lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
+            if (lmodel != NULL && !inst.lora.empty() && inst.lora != "none") {
+                llama_adapter_lora_init(lmodel, inst.lora.c_str());
+            }
             if (lmodel == NULL) {
                 fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
                 return 1;
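
As a point of reference for the --lora handling above, the following is a minimal sketch of the adapter lifecycle in the public llama.h API, assuming the llama_adapter_lora_init, llama_set_adapter_lora, llama_adapter_lora_free and llama_init_from_model entry points; the file names and the 1.0f scale are illustrative only and are not part of the patch. Note that loading an adapter with llama_adapter_lora_init does not by itself change inference; the returned handle has to be attached to a llama_context for the adapter to take effect.

// Sketch only: adapter lifecycle around llama_adapter_lora_init().
// Paths and the 1.0f scale below are assumptions for this example.
#include "llama.h"
#include <cstdio>

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("models/7B/ggml-model-q4_0.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // what the patch does for --lora values other than "none": load the adapter against the base model
    llama_adapter_lora * adapter = llama_adapter_lora_init(model, "my-adapter.gguf");
    if (adapter == NULL) {
        fprintf(stderr, "failed to load LoRA adapter\n");
        return 1;
    }

    // the adapter only influences inference once it is attached to a context
    llama_context * ctx = llama_init_from_model(model, llama_context_default_params());
    llama_set_adapter_lora(ctx, adapter, 1.0f);

    // ... run prompt processing / token generation here ...

    llama_free(ctx);
    llama_adapter_lora_free(adapter);
    llama_model_free(model);
    return 0;
}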