llama.cpp/examples/common.h

// Various helper functions and utilities

#pragma once

#include "llama.h"

#include <string>
#include <vector>
#include <random>
#include <thread>
#include <unordered_map>

//
// CLI argument parsing
//
// Returns a core count; the result is used as the default value of
// gpt_params::n_threads.
// NOTE(review): going by the name this should count physical rather than
// logical cores - confirm against the implementation.
int32_t get_num_physical_cores();

struct gpt_params {
    int32_t seed          = -1;   // RNG seed
    int32_t n_threads     = get_num_physical_cores();
    int32_t n_predict     = -1;  // new tokens to predict
    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
    int32_t n_ctx         = 512;  // context size
    int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt

    // sampling parameters
    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
    int32_t top_k             = 40;    // <= 0 to use vocab size
    float   top_p             = 0.95f; // 1.0 = disabled
    float   tfs_z             = 1.00f; // 1.0 = disabled
    float   typical_p         = 1.00f; // 1.0 = disabled
    float   temp              = 0.80f; // 1.0 = disabled
    float   repeat_penalty    = 1.10f; // 1.0 = disabled
    int32_t repeat_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float   frequency_penalty = 0.00f; // 0.0 = disabled
    float   presence_penalty  = 0.00f; // 0.0 = disabled
    int     mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float   mirostat_tau      = 5.00f; // target entropy
    float   mirostat_eta      = 0.10f; // learning rate

    std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
    std::string prompt = "";
    std::string path_session = "";       // path to file for saving/loading model eval state
    std::string input_prefix = "";       // string to prefix user inputs with
    std::string input_suffix = "";       // string to suffix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

    std::string lora_adapter = "";  // lora adapter path
    std::string lora_base = "";     // base model path for the lora adapter

    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode

    bool embedding         = false; // get only sentence embedding
    bool interactive_first = false; // wait for user input immediately

    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool penalize_nl       = true;  // consider newlines as a repeatable token
    bool perplexity        = false; // compute perplexity over the prompt
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool mem_test          = false; // compute maximum memory usage
    bool verbose_prompt    = false; // print prompt tokens before generation
};

// Command-line parsing entry point: receives main()'s argc/argv and a mutable
// gpt_params to write the parsed settings into.
// NOTE(review): the meaning of the bool result is not visible from this header
// - presumably false signals a parse failure; confirm in the implementation.
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

// Usage/help output for the command line. `params` is taken by const reference
// and is therefore only read.
// NOTE(review): presumably the values in `params` are shown as the defaults in
// the help text - confirm in the implementation.
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

// Returns a prompt string chosen with the caller-supplied generator. `rng` is
// passed by non-const reference, so the call may advance its state.
std::string gpt_random_prompt(std::mt19937 & rng);

//
// Vocab utils
//

// Convenience overload that returns the tokens for `text` as a std::vector,
// using the given llama context.
// NOTE(review): `add_bos` presumably requests a leading beginning-of-sequence
// token - confirm against llama.h.
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);

//
// Model utils
//

// Creates a llama_context from the settings in `params` (read-only here).
// NOTE(review): ownership of the returned pointer and the failure value
// (presumably a null pointer) are not visible from this header - confirm in the
// implementation before relying on either.
struct llama_context * llama_init_from_gpt_params(const gpt_params & params);

//
// Console utils
//

// ANSI escape sequences of the form ESC '[' <code> 'm', kept as plain string
// literals so they can be concatenated with adjacent literals or printed as-is.
// Codes 31-36 select the foreground colors named by the macros, 0 resets all
// attributes and 1 switches on bold.
#define ANSI_COLOR_RED     "\x1b[31m"
#define ANSI_COLOR_GREEN   "\x1b[32m"
#define ANSI_COLOR_YELLOW  "\x1b[33m"
#define ANSI_COLOR_BLUE    "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN    "\x1b[36m"
#define ANSI_COLOR_RESET   "\x1b[0m"
#define ANSI_BOLD          "\x1b[1m"

// Color roles accepted by set_console_color() and stored in
// console_state::color. The numeric values are spelled out explicitly.
enum console_color_t {
    CONSOLE_COLOR_DEFAULT    = 0, // default console color
    CONSOLE_COLOR_PROMPT     = 1, // color role for the prompt
    CONSOLE_COLOR_USER_INPUT = 2, // color role for user input
};

// Console bookkeeping that set_console_color() receives by mutable reference.
struct console_state {
    bool            use_color{false};             // color output flag, off by default
    console_color_t color{CONSOLE_COLOR_DEFAULT}; // starts out as CONSOLE_COLOR_DEFAULT
};

// Requests console color `color`. `con_st` is taken by non-const reference, so
// the call may update it.
// NOTE(review): presumably a no-op when con_st.use_color is false and records
// the new color in con_st.color - confirm in the implementation.
void set_console_color(console_state & con_st, console_color_t color);

// Windows-only helpers; these declarations exist only when _WIN32 is defined.
#if defined (_WIN32)
// Console setup for Windows; `enable_color` tells it whether color output is wanted.
// NOTE(review): exactly what gets configured is not visible from this header.
void win32_console_init(bool enable_color);
// Encodes the wide string `wstr` as UTF-8 and delivers the result through the
// output parameter `str`.
// NOTE(review): whether `str` is overwritten or appended to is not visible here.
void win32_utf8_encode(const std::wstring & wstr, std::string & str);
#endif
Initial release 2023-03-10 19:40:58 +01:00			`// Various helper functions and utilities`

			`#pragma once`

Introduce C-style API (#370) * Major refactoring - introduce C-style API * Clean up * Add <cassert> * Add <iterator> * Add <algorithm> .... * Fix timing reporting and accumulation * Measure eval time only for single-token calls * Change llama_tokenize return meaning 2023-03-22 06:32:36 +01:00			`#include "llama.h"`

Initial release 2023-03-10 19:40:58 +01:00			`#include <string>`
			`#include <vector>`
			`#include <random>`
			`#include <thread>`
llama : new sampling algorithms (#1126) * Sample interface, new samplers. New samplers: - locally typical sampling - tail free sampling - frequency and presence penalty - mirostat Ignore EOS fix: -inf should be used. * mirostat * Added --logit-bias and --no-penalize-nl, removed std::span * Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and k) Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and k) * Save and load example adjust * Tests * Windows build fix * Windows test fix 2023-04-29 07:34:41 +02:00			`#include <unordered_map>`
Initial release 2023-03-10 19:40:58 +01:00
			`//`
			`// CLI argument parsing`
			`//`
common : better default number of threads (#934) * commit * fix * try-catch * apply code review * improve * improve * add macos headers * done * remove color * fix windows * minor * fix * Apply suggestions from code review Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com> * remove * minor * minor --------- Co-authored-by: jon-chuang <jon-chuang@users.noreply.github.com> Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com> 2023-04-30 20:41:35 +02:00			`int32_t get_num_physical_cores();`
Initial release 2023-03-10 19:40:58 +01:00
			`struct gpt_params {`
Revert "main : alternative instruct mode (Vicuna support, etc.) (#863)" (#982) This reverts commit f4d277ae17247ee51129ef1a9ff74d377cc90b1b. 2023-04-14 21:58:43 +02:00			`int32_t seed = -1; // RNG seed`
common : better default number of threads (#934) * commit * fix * try-catch * apply code review * improve * improve * add macos headers * done * remove color * fix windows * minor * fix * Apply suggestions from code review Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com> * remove * minor * minor --------- Co-authored-by: jon-chuang <jon-chuang@users.noreply.github.com> Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com> 2023-04-30 20:41:35 +02:00			`int32_t n_threads = get_num_physical_cores();`
			`int32_t n_predict = -1; // new tokens to predict`
Revert "main : alternative instruct mode (Vicuna support, etc.) (#863)" (#982) This reverts commit f4d277ae17247ee51129ef1a9ff74d377cc90b1b. 2023-04-14 21:58:43 +02:00			`int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)`
			`int32_t n_ctx = 512; // context size`
llama : have n_batch default to 512 (#1091) * set default n_batch to 512 when using BLAS * spacing * alternate implementation of setting different n_batch for BLAS * set n_batch to 512 for all cases 2023-04-22 10:27:05 +02:00			`int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)`
Revert "main : alternative instruct mode (Vicuna support, etc.) (#863)" (#982) This reverts commit f4d277ae17247ee51129ef1a9ff74d377cc90b1b. 2023-04-14 21:58:43 +02:00			`int32_t n_keep = 0; // number of tokens to keep from initial prompt`
Default to 4 threads (#243) 2023-03-17 20:46:46 +01:00
Initial release 2023-03-10 19:40:58 +01:00			`// sampling parameters`
llama : new sampling algorithms (#1126) * Sample interface, new samplers. New samplers: - locally typical sampling - tail free sampling - frequency and presence penalty - mirostat Ignore EOS fix: -inf should be used. * mirostat * Added --logit-bias and --no-penalize-nl, removed std::span * Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and k) Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and k) * Save and load example adjust * Tests * Windows build fix * Windows test fix 2023-04-29 07:34:41 +02:00			`std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens`
common : change default parameters to pre-#1126 (#1223) 2023-04-29 08:51:06 +02:00			`int32_t top_k = 40; // <= 0 to use vocab size`
			`float top_p = 0.95f; // 1.0 = disabled`
			`float tfs_z = 1.00f; // 1.0 = disabled`
			`float typical_p = 1.00f; // 1.0 = disabled`
			`float temp = 0.80f; // 1.0 = disabled`
			`float repeat_penalty = 1.10f; // 1.0 = disabled`
			`int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)`
			`float frequency_penalty = 0.00f; // 0.0 = disabled`
			`float presence_penalty = 0.00f; // 0.0 = disabled`
			`int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0`
			`float mirostat_tau = 5.00f; // target entropy`
			`float mirostat_eta = 0.10f; // learning rate`
Initial release 2023-03-10 19:40:58 +01:00
Change default repeat_penalty to 1.0 I feel this penalty is not really helping. Especially for the example from the README it makes results pretty bad 2023-03-21 16:32:14 +01:00			`std::string model = "models/lamma-7B/ggml-model.bin"; // model path`
			`std::string prompt = "";`
llama : add session file format and saved sessions in main (#1169) 2023-04-28 17:59:37 +02:00			`std::string path_session = ""; // path to file for saving/loading model eval state`
Add LoRA support (#820) 2023-04-17 17:28:55 +02:00			`std::string input_prefix = ""; // string to prefix user inputs with`
main : add --in-suffix option (#1318) * adding --in-suffix option * print input suffix before generation 2023-05-04 17:41:12 +02:00			`std::string input_suffix = ""; // string to suffix user inputs with`
Change default repeat_penalty to 1.0 I feel this penalty is not really helping. Especially for the example from the README it makes results pretty bad 2023-03-21 16:32:14 +01:00			`std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted`
Add interactive mode (#61) * Initial work on interactive mode. * Improve interactive mode. Make rev. prompt optional. * Update README to explain interactive mode. * Fix OS X build 2023-03-12 22:13:28 +01:00
Add LoRA support (#820) 2023-04-17 17:28:55 +02:00			`std::string lora_adapter = ""; // lora adapter path`
			`std::string lora_base = ""; // base model path for the lora adapter`

Reduce memory usage and allocate enough memory for largest context (#473) * Reduce memory usage and allocate enough memory for large contexts * Simpler scratch buffer usage * Reenable BLAS for quantized mul_mat * Fix number of layers in 30B and 65B * Fix KV cache size for F32 2023-03-24 22:17:37 +01:00			`bool memory_f16 = true; // use f16 instead of f32 for memory kv`
Change default repeat_penalty to 1.0 I feel this penalty is not really helping. Especially for the example from the README it makes results pretty bad 2023-03-21 16:32:14 +01:00			`bool random_prompt = false; // do not randomize prompt if none provided`
			`bool use_color = false; // use color to distinguish generations and inputs`
			`bool interactive = false; // interactive mode`
Add embedding mode with arg flag. Currently working (#282) * working but ugly * add arg flag, not working on embedding mode * typo * Working! Thanks to @nullhook * make params argument instead of hardcoded boolean. remove useless time check * start doing the instructions but not finished. This probably doesnt compile * Embeddings extraction support --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-24 16:05:13 +01:00
			`bool embedding = false; // get only sentence embedding`
examples/main README improvements and some light refactoring (#1131) 2023-04-24 17:45:32 +02:00			`bool interactive_first = false; // wait for user input immediately`
Add embedding mode with arg flag. Currently working (#282) * working but ugly * add arg flag, not working on embedding mode * typo * Working! Thanks to @nullhook * make params argument instead of hardcoded boolean. remove useless time check * start doing the instructions but not finished. This probably doesnt compile * Embeddings extraction support --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-24 16:05:13 +01:00
Change default repeat_penalty to 1.0 I feel this penalty is not really helping. Especially for the example from the README it makes results pretty bad 2023-03-21 16:32:14 +01:00			`bool instruct = false; // instruction mode (used for Alpaca models)`
llama : new sampling algorithms (#1126) * Sample interface, new samplers. New samplers: - locally typical sampling - tail free sampling - frequency and presence penalty - mirostat Ignore EOS fix: -inf should be used. * mirostat * Added --logit-bias and --no-penalize-nl, removed std::span * Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and k) Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and k) * Save and load example adjust * Tests * Windows build fix * Windows test fix 2023-04-29 07:34:41 +02:00			`bool penalize_nl = true; // consider newlines as a repeatable token`
Compute perplexity over prompt (#270) * Compute perplexity over prompt * More accurate perplexity calculation - over all logits in the context window (so 512x more tokens!) * Output all perplexitiies * Add timing/ETA 2023-03-21 17:27:42 +01:00			`bool perplexity = false; // compute perplexity over the prompt`
Rewrite loading code to try to satisfy everyone: - Support all three formats (ggml, ggmf, ggjt). (However, I didn't include the hack needed to support GPT4All files without conversion. Those can still be used after converting them with convert.py from my other PR.) - Support both mmap and read (mmap is used by default, but can be disabled with `--no-mmap`, and is automatically disabled for pre-ggjt files or on platforms where mmap is not supported). - Support multi-file models like before, but automatically determine the number of parts rather than requiring `--n_parts`. - Improve validation and error checking. - Stop using the per-file type field (f16) entirely in favor of just relying on the per-tensor type/size fields. This has no immediate benefit, but makes it easier to experiment with different formats, and should make it easier to support the new GPTQ-for-LLaMa models in the future (I have some work in progress on that front). - Support VirtualLock on Windows (using the same `--mlock` option as on Unix). - Indicate loading progress when using mmap + mlock. (Which led me to the interesting observation that on my Linux machine, with a warm file cache, mlock actually takes some time, whereas mmap without mlock starts almost instantly...) - To help implement this, move mlock support from ggml to the loading code. - madvise/PrefetchVirtualMemory support (based on #740) - Switch from ifstream to the `fopen` family of functions to avoid unnecessary copying and, when mmap is enabled, allow reusing the same file descriptor for both metadata reads and mmap (whereas the existing implementation opens the file a second time to mmap). - Quantization now produces a single-file output even with multi-file inputs (not really a feature as much as 'it was easier this way'). Implementation notes: I tried to factor the code into more discrete pieces than before. 
Regarding code style: I tried to follow the code style, but I'm naughty and used a few advanced C++ features repeatedly: - Destructors to make it easier to ensure everything gets cleaned up. - Exceptions. I don't even usually use exceptions when writing C++, and I can remove them if desired... but here they make the loading code much more succinct while still properly handling a variety of errors, ranging from API calls failing to integer overflow and allocation failure. The exceptions are converted to error codes at the API boundary.) Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740) 2023-04-08 21:24:37 +02:00			`bool use_mmap = true; // use mmap for faster loads`
Support calling mlock() on loaded model data on Linux and macOS (#453) * Support calling mlock() on loaded model data on Linux and macOS This is enabled by a new --mlock command line option. Using mlock() disables swapping and memory compression for the model data. Doing so can be useful on systems where the model takes up a large fraction of system RAM. In my experience, macOS is quite eager to start compressing llama.cpp's memory, which then makes it halt for a few seconds while it decompresses, even with a model that uses "only" 25GB out of 32GB. Of course, this comes at the cost of forcing the system to swap or compress other processes' memory instead, so it needs to be used with care and shouldn't be enabled by default. In theory it should be possible to support this on Windows as well using VirtualLock(), but I'm not much of a Windows user. * Update llama.cpp --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-24 16:19:05 +01:00			`bool use_mlock = false; // use mlock to keep model in memory`
Reduce memory usage and allocate enough memory for largest context (#473) * Reduce memory usage and allocate enough memory for large contexts * Simpler scratch buffer usage * Reenable BLAS for quantized mul_mat * Fix number of layers in 30B and 65B * Fix KV cache size for F32 2023-03-24 22:17:37 +01:00			`bool mem_test = false; // compute maximum memory usage`
Disable prompt verbosity by default and add option to enable (#480) 2023-03-25 16:16:50 +01:00			`bool verbose_prompt = false; // print prompt tokens before generation`
Initial release 2023-03-10 19:40:58 +01:00			`};`

			`bool gpt_params_parse(int argc, char ** argv, gpt_params & params);`

Revert "main : alternative instruct mode (Vicuna support, etc.) (#863)" (#982) This reverts commit f4d277ae17247ee51129ef1a9ff74d377cc90b1b. 2023-04-14 21:58:43 +02:00			`void gpt_print_usage(int argc, char ** argv, const gpt_params & params);`
Initial release 2023-03-10 19:40:58 +01:00
			`std::string gpt_random_prompt(std::mt19937 & rng);`

			`//`
			`// Vocab utils`
			`//`

Introduce C-style API (#370) * Major refactoring - introduce C-style API * Clean up * Add <cassert> * Add <iterator> * Add <algorithm> .... * Fix timing reporting and accumulation * Measure eval time only for single-token calls * Change llama_tokenize return meaning 2023-03-22 06:32:36 +01:00			`std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);`
main.cpp fixes, refactoring (#571) - main: entering empty line passes back control without new input in interactive/instruct modes - instruct mode: keep prompt fix - instruct mode: duplicate instruct prompt fix - refactor: move common console code from main->common 2023-03-28 16:09:55 +02:00
examples : add llama_init_from_gpt_params() common function (#1290) Signed-off-by: deadprogram <ron@hybridgroup.com> 2023-05-02 22:39:51 +02:00			`//`
			`// Model utils`
			`//`

			`struct llama_context * llama_init_from_gpt_params(const gpt_params & params);`

main.cpp fixes, refactoring (#571) - main: entering empty line passes back control without new input in interactive/instruct modes - instruct mode: keep prompt fix - instruct mode: duplicate instruct prompt fix - refactor: move common console code from main->common 2023-03-28 16:09:55 +02:00			`//`
			`// Console utils`
			`//`

			`#define ANSI_COLOR_RED "\x1b[31m"`
			`#define ANSI_COLOR_GREEN "\x1b[32m"`
			`#define ANSI_COLOR_YELLOW "\x1b[33m"`
			`#define ANSI_COLOR_BLUE "\x1b[34m"`
			`#define ANSI_COLOR_MAGENTA "\x1b[35m"`
			`#define ANSI_COLOR_CYAN "\x1b[36m"`
			`#define ANSI_COLOR_RESET "\x1b[0m"`
			`#define ANSI_BOLD "\x1b[1m"`

			`enum console_color_t {`
			`CONSOLE_COLOR_DEFAULT=0,`
			`CONSOLE_COLOR_PROMPT,`
			`CONSOLE_COLOR_USER_INPUT`
			`};`

			`struct console_state {`
			`bool use_color = false;`
			`console_color_t color = CONSOLE_COLOR_DEFAULT;`
			`};`

			`void set_console_color(console_state & con_st, console_color_t color);`

			`#if defined (_WIN32)`
			`void win32_console_init(bool enable_color);`
fix for windows utf-8 input (#840) Use UTF-16 as input on Windows, since UTF-8 does not work and reads multibyte characters as zeros 2023-04-08 17:49:39 +02:00			`void win32_utf8_encode(const std::wstring & wstr, std::string & str);`
main.cpp fixes, refactoring (#571) - main: entering empty line passes back control without new input in interactive/instruct modes - instruct mode: keep prompt fix - instruct mode: duplicate instruct prompt fix - refactor: move common console code from main->common 2023-03-28 16:09:55 +02:00			`#endif`