diff --git a/common/arg.cpp b/common/arg.cpp index 7c5c5e5cd..4115b2f75 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1939,17 +1939,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.simple_io = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(common_arg( - {"-ld", "--logdir"}, "LOGDIR", - "path under which to save YAML logs (no logging if unset)", - [](common_params & params, const std::string & value) { - params.logdir = value; - - if (params.logdir.back() != DIRECTORY_SEPARATOR) { - params.logdir += DIRECTORY_SEPARATOR; - } - } - )); add_opt(common_arg( {"--positive-file"}, "FNAME", string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), diff --git a/common/common.cpp b/common/common.cpp index ebd16b600..930374621 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1890,213 +1890,3 @@ common_control_vector_data common_control_vector_load(const std::vector & data) { - if (data.empty()) { - fprintf(stream, "%s:\n", prop_name); - return; - } - - fprintf(stream, "%s: [", prop_name); - for (size_t i = 0; i < data.size() - 1; ++i) { - fprintf(stream, "%e, ", data[i]); - } - fprintf(stream, "%e]\n", data.back()); -} - -void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector & data) { - if (data.empty()) { - fprintf(stream, "%s:\n", prop_name); - return; - } - - fprintf(stream, "%s: [", prop_name); - for (size_t i = 0; i < data.size() - 1; ++i) { - fprintf(stream, "%d, ", data[i]); - } - fprintf(stream, "%d]\n", data.back()); -} - -void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) { - std::string data_str(data == NULL ? "" : data); - - if (data_str.empty()) { - fprintf(stream, "%s:\n", prop_name); - return; - } - - size_t pos_start = 0; - size_t pos_found = 0; - - if (std::isspace(data_str[0]) || std::isspace(data_str.back())) { - data_str = std::regex_replace(data_str, std::regex("\n"), "\\n"); - data_str = std::regex_replace(data_str, std::regex("\""), "\\\""); - data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)"); - data_str = "\"" + data_str + "\""; - fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); - return; - } - - if (data_str.find('\n') == std::string::npos) { - fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); - return; - } - - fprintf(stream, "%s: |\n", prop_name); - while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) { - fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str()); - pos_start = pos_found + 1; - } -} - -void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx, - const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { - ggml_cpu_init(); // some ARM features are detected at runtime - - const auto & sparams = params.sparams; - - fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); - fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); - fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); - fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); - fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false"); - fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); - fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); - fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false"); - fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); - fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); - fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); - fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false"); - -#ifdef NDEBUG - fprintf(stream, "debug: false\n"); -#else - fprintf(stream, "debug: true\n"); -#endif // NDEBUG - - fprintf(stream, "model_desc: %s\n", model_desc); - fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx))); - -#ifdef __OPTIMIZE__ - fprintf(stream, "optimize: true\n"); -#else - fprintf(stream, "optimize: false\n"); -#endif // __OPTIMIZE__ - - fprintf(stream, "time: %s\n", timestamp.c_str()); - - fprintf(stream, "\n"); - fprintf(stream, "###############\n"); - fprintf(stream, "# User Inputs #\n"); - fprintf(stream, "###############\n"); - fprintf(stream, "\n"); - - fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str()); - fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch); - fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks); - fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); - fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); - fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length); - fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base); - fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier); - fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n); - fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); - fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); - fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq); - yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str()); - fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); - fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); - fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); - fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false"); - - yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str()); - fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false"); - yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str()); - fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); - fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false"); - fprintf(stream, "keep: %d # default: 0\n", params.n_keep); - fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); - - fprintf(stream, "logit_bias:\n"); - for (const auto & logit_bias : sparams.logit_bias) { - fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias); - } - - fprintf(stream, "lora:\n"); - for (auto & la : params.lora_adapters) { - if (la.scale == 1.0f) { - fprintf(stream, " - %s\n", la.path.c_str()); - } - } - fprintf(stream, "lora_scaled:\n"); - for (auto & la : params.lora_adapters) { - if (la.scale != 1.0f) { - fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale); - } - } - fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false"); - fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); - fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); - fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); - fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); - fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); - fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); - fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH); - fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); - fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); - fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); - fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); - fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs); - fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); - fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false"); - fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); - fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); - fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); - yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str()); - fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); - fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); - fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); - yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens); - fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); - - fprintf(stream, "reverse_prompt:\n"); - for (std::string ap : params.antiprompt) { - size_t pos = 0; - while ((pos = ap.find('\n', pos)) != std::string::npos) { - ap.replace(pos, 1, "\\n"); - pos += 1; - } - - fprintf(stream, " - %s\n", ap.c_str()); - } - - fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base); - fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); - fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); - fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); - fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false"); - fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); - - const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); - yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector); - - fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency()); - fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); - fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); - fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); - fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability); - fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold); - fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p); - fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); - fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); -} diff --git a/common/common.h b/common/common.h index 6289feaeb..7977cc7a9 100644 --- a/common/common.h +++ b/common/common.h @@ -209,7 +209,6 @@ struct common_params { std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT std::string input_prefix = ""; // string to prefix user inputs with // NOLINT std::string input_suffix = ""; // string to suffix user inputs with // NOLINT - std::string logdir = ""; // directory in which to save YAML log files // NOLINT std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT std::string logits_file = ""; // file for saving *all* logits // NOLINT @@ -584,15 +583,3 @@ common_control_vector_data common_control_vector_load(const std::vector & data); -void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector & data); -void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data); - -void yaml_dump_non_result_info( - FILE * stream, const common_params & params, const llama_context * lctx, - const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index f18362c91..15b358dc4 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -43,50 +43,6 @@ static std::vector * g_output_tokens; static bool is_interacting = false; -static void write_logfile( - const llama_context * ctx, const common_params & params, const llama_model * model, - const std::vector & input_tokens, const std::string & output, - const std::vector & output_tokens -) { - if (params.logdir.empty()) { - return; - } - - const std::string timestamp = string_get_sortable_timestamp(); - - const bool success = fs_create_directory_with_parents(params.logdir); - if (!success) { - LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n", - __func__, params.logdir.c_str()); - return; - } - - const std::string logfile_path = params.logdir + timestamp + ".yml"; - FILE * logfile = fopen(logfile_path.c_str(), "w"); - - if (logfile == NULL) { - LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); - return; - } - - fprintf(logfile, "binary: infill\n"); - char model_desc[128]; - llama_model_desc(model, model_desc, sizeof(model_desc)); - yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc); - - fprintf(logfile, "\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "# Generation Results #\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "\n"); - - yaml_dump_string_multiline(logfile, "output", output.c_str()); - yaml_dump_vector_int(logfile, "output_tokens", output_tokens); - - llama_perf_dump_yaml(logfile, ctx); - fclose(logfile); -} - #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) static void sigint_handler(int signo) { if (signo == SIGINT) { @@ -96,7 +52,6 @@ static void sigint_handler(int signo) { console::cleanup(); LOG("\n"); common_perf_print(*g_ctx, *g_smpl); - write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); // make sure all logs are flushed LOG("Interrupted by user\n"); @@ -625,7 +580,6 @@ int main(int argc, char ** argv) { LOG("\n"); common_perf_print(ctx, smpl); - write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); llama_free(ctx); llama_free_model(model); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 374ed47ad..7c4ce4be2 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -62,49 +62,6 @@ static bool file_is_empty(const std::string & path) { return f.tellg() == 0; } -static void write_logfile( - const llama_context * ctx, const common_params & params, const llama_model * model, - const std::vector & input_tokens, const std::string & output, - const std::vector & output_tokens -) { - if (params.logdir.empty()) { - return; - } - - const std::string timestamp = string_get_sortable_timestamp(); - - const bool success = fs_create_directory_with_parents(params.logdir); - if (!success) { - LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str()); - return; - } - - const std::string logfile_path = params.logdir + timestamp + ".yml"; - FILE * logfile = fopen(logfile_path.c_str(), "w"); - - if (logfile == NULL) { - LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); - return; - } - - fprintf(logfile, "binary: main\n"); - char model_desc[128]; - llama_model_desc(model, model_desc, sizeof(model_desc)); - yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc); - - fprintf(logfile, "\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "# Generation Results #\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "\n"); - - yaml_dump_string_multiline(logfile, "output", output.c_str()); - yaml_dump_vector_int(logfile, "output_tokens", output_tokens); - - llama_perf_dump_yaml(logfile, ctx); - fclose(logfile); -} - #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) static void sigint_handler(int signo) { if (signo == SIGINT) { @@ -115,7 +72,6 @@ static void sigint_handler(int signo) { console::cleanup(); LOG("\n"); common_perf_print(*g_ctx, *g_smpl); - write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); // make sure all logs are flushed LOG("Interrupted by user\n"); @@ -926,7 +882,6 @@ int main(int argc, char ** argv) { LOG("\n\n"); common_perf_print(ctx, smpl); - write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); common_sampler_free(smpl); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index e803ff143..64a84607c 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -34,55 +34,6 @@ struct results_log_softmax { float prob; }; -static void write_logfile( - const llama_context * ctx, const common_params & params, const llama_model * model, - const struct results_perplexity & results -) { - if (params.logdir.empty()) { - return; - } - - if (params.hellaswag) { - LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__); - return; - } - - const std::string timestamp = string_get_sortable_timestamp(); - - const bool success = fs_create_directory_with_parents(params.logdir); - if (!success) { - LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n", - __func__, params.logdir.c_str()); - return; - } - - const std::string logfile_path = params.logdir + timestamp + ".yml"; - FILE * logfile = fopen(logfile_path.c_str(), "w"); - - if (logfile == NULL) { - LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); - return; - } - - fprintf(logfile, "binary: main\n"); - char model_desc[128]; - llama_model_desc(model, model_desc, sizeof(model_desc)); - yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc); - - fprintf(logfile, "\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "# Perplexity Results #\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "\n"); - - yaml_dump_vector_float(logfile, "logits", results.logits); - fprintf(logfile, "ppl_value: %f\n", results.ppl_value); - yaml_dump_vector_float(logfile, "probs", results.probs); - - llama_perf_dump_yaml(logfile, ctx); - fclose(logfile); -} - static std::vector softmax(const std::vector& logits) { std::vector probs(logits.size()); float max_logit = logits[0]; @@ -2072,8 +2023,6 @@ int main(int argc, char ** argv) { LOG("\n"); llama_perf_context_print(ctx); - write_logfile(ctx, params, model, results); - llama_free(ctx); llama_free_model(model); diff --git a/examples/server/README.md b/examples/server/README.md index 6f72c6bb8..0936e0b7b 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -85,7 +85,6 @@ The project is under active development, and we are [looking for feedback and co | `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)
(env: LLAMA_ARG_HF_REPO) | | `-hff, --hf-file FILE` | Hugging Face model file (default: unused)
(env: LLAMA_ARG_HF_FILE) | | `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) | -| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) | | `--log-disable` | Log disable | | `--log-file FNAME` | Log to file | | `--log-colors` | Enable colored logging
(env: LLAMA_LOG_COLORS) | diff --git a/include/llama.h b/include/llama.h index 5e742642e..bc268e799 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1244,8 +1244,6 @@ extern "C" { LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); - LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx); - #ifdef __cplusplus } #endif diff --git a/scripts/run-with-preset.py b/scripts/run-with-preset.py deleted file mode 100755 index 8f0bf8ca8..000000000 --- a/scripts/run-with-preset.py +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/env python3 - -import logging -import argparse -import os -import subprocess -import sys - -import yaml - -logger = logging.getLogger("run-with-preset") - -CLI_ARGS_LLAMA_CLI_PERPLEXITY = [ - "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape", - "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag", - "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", - "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base", - "low-vram", "main-gpu", "mirostat", "mirostat-ent", "mirostat-lr", "mlock", - "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q", - "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt", - "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "repeat-last-n", - "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed", - "simple-io", "tensor-split", "threads", "temp", "top-k", "top-p", "typical", - "verbose-prompt" -] - -CLI_ARGS_LLAMA_BENCH = [ - "batch-size", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers", - "n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose" -] - -CLI_ARGS_LLAMA_SERVER = [ - "alias", "batch-size", "ctx-size", "embedding", "host", "lora", "lora-base", - "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q", - "numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split", - "threads", "verbose" -] - -description = """Run llama.cpp binaries with presets from YAML file(s). -To specify which binary should be run, specify the "binary" property (llama-cli, llama-perplexity, llama-bench, and llama-server are supported). -To get a preset file template, run a llama.cpp binary with the "--logdir" CLI argument. - -Formatting considerations: -- The YAML property names are the same as the CLI argument names of the corresponding binary. -- Properties must use the long name of their corresponding llama.cpp CLI arguments. -- Like the llama.cpp binaries the property names do not differentiate between hyphens and underscores. -- Flags must be defined as ": true" to be effective. -- To define the logit_bias property, the expected format is ": " in the "logit_bias" namespace. -- To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings. -- To define a tensor split, pass a list of floats. -""" -usage = "run-with-preset.py [-h] [yaml_files ...] [-- ...]" -epilog = (" -- specify additional CLI ars to be passed to the binary (override all preset files). " - "Unknown args will be ignored.") - -parser = argparse.ArgumentParser( - description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter) -parser.add_argument("-bin", "--binary", help="The binary to run.") -parser.add_argument("yaml_files", nargs="*", - help="Arbitrary number of YAML files from which to read preset values. " - "If two files specify the same values the later one will be used.") -parser.add_argument("--verbose", action="store_true", help="increase output verbosity") - -known_args, unknown_args = parser.parse_known_args() - -if not known_args.yaml_files and not unknown_args: - parser.print_help() - sys.exit(0) - -logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO) - -props = dict() - -for yaml_file in known_args.yaml_files: - with open(yaml_file, "r") as f: - props.update(yaml.load(f, yaml.SafeLoader)) - -props = {prop.replace("_", "-"): val for prop, val in props.items()} - -binary = props.pop("binary", "llama-cli") -if known_args.binary: - binary = known_args.binary - -if os.path.exists(f"./{binary}"): - binary = f"./{binary}" - -if binary.lower().endswith("llama-cli") or binary.lower().endswith("llama-perplexity"): - cli_args = CLI_ARGS_LLAMA_CLI_PERPLEXITY -elif binary.lower().endswith("llama-bench"): - cli_args = CLI_ARGS_LLAMA_BENCH -elif binary.lower().endswith("llama-server"): - cli_args = CLI_ARGS_LLAMA_SERVER -else: - logger.error(f"Unknown binary: {binary}") - sys.exit(1) - -command_list = [binary] - -for cli_arg in cli_args: - value = props.pop(cli_arg, None) - - if not value or value == -1: - continue - - if cli_arg == "logit-bias": - for token, bias in value.items(): - command_list.append("--logit-bias") - command_list.append(f"{token}{bias:+}") - continue - - if cli_arg == "reverse-prompt" and not isinstance(value, str): - for rp in value: - command_list.append("--reverse-prompt") - command_list.append(str(rp)) - continue - - command_list.append(f"--{cli_arg}") - - if cli_arg == "tensor-split": - command_list.append(",".join([str(v) for v in value])) - continue - - value = str(value) - - if value != "True": - command_list.append(str(value)) - -num_unused = len(props) -if num_unused > 10: - logger.info(f"The preset file contained a total of {num_unused} unused properties.") -elif num_unused > 0: - logger.info("The preset file contained the following unused properties:") - for prop, value in props.items(): - logger.info(f" {prop}: {value}") - -command_list += unknown_args - -sp = subprocess.Popen(command_list) - -while sp.returncode is None: - try: - sp.wait() - except KeyboardInterrupt: - pass - -sys.exit(sp.returncode) diff --git a/src/llama.cpp b/src/llama.cpp index dc5dfba0c..1703104fb 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -22075,28 +22075,6 @@ void llama_perf_context_reset(struct llama_context * ctx) { ctx->t_p_eval_us = ctx->n_p_eval = 0; } -void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) { - fprintf(stream, "\n"); - fprintf(stream, "###########\n"); - fprintf(stream, "# Timings #\n"); - fprintf(stream, "###########\n"); - fprintf(stream, "\n"); - - fprintf(stream, "mst_eval: %.2f # ms / token during generation\n", - 1.0e-3 * ctx->t_eval_us / ctx->n_eval); - fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n", - 1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval); - fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval); - fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval); - fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us); - fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us); - fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us); - fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n", - 1.0e6 * ctx->n_eval / ctx->t_eval_us); - fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n", - 1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us); -} - // For internal test use const std::vector> & llama_internal_get_tensor_map( struct llama_context * ctx