// llama.cpp/examples/common.h

// Various helper functions and utilities

#pragma once

#include "llama.h"

#include <algorithm>
#include <cstdint>
#include <random>
#include <string>
#include <thread>
#include <vector>

//
// CLI argument parsing
//

// Parameters configurable from the command line. Each field is declared with
// its default value; gpt_params_parse() receives an instance by reference.
struct gpt_params {
    int32_t seed          = -1;   // RNG seed
    // Default thread count: the reported hardware concurrency, capped at 4.
    // std::thread::hardware_concurrency() is allowed to return 0 when the
    // value cannot be determined, so clamp to at least 1 to avoid ever
    // defaulting to zero threads.
    int32_t n_threads     = std::max<int32_t>(1, std::min<int32_t>(4, static_cast<int32_t>(std::thread::hardware_concurrency())));
    int32_t n_predict     = 128;  // new tokens to predict
    int32_t repeat_last_n = 64;   // last n tokens to penalize
    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
    int32_t n_ctx         = 512;  // context size
    int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt

    // sampling parameters
    int32_t top_k = 40;
    float   top_p = 0.95f;
    float   temp  = 0.80f;
    float   repeat_penalty  = 1.10f;

    // NOTE(review): "lamma" in the default path looks like a typo for "llama";
    // the literal is left unchanged because it is a runtime default - confirm
    // before renaming.
    std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
    std::string prompt = "";
    std::string input_prefix = "";       // string to prefix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

    std::string lora_adapter = "";  // lora adapter path
    std::string lora_base = "";     // base model path for the lora adapter

    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // randomize prompt if none provided (disabled by default)
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode

    bool embedding         = false; // get only sentence embedding
    bool interactive_first = false; // wait for user input immediately

    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool ignore_eos        = false; // do not stop generating after eos
    bool perplexity        = false; // compute perplexity over the prompt
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool mem_test          = false; // compute maximum memory usage
    bool verbose_prompt    = false; // print prompt tokens before generation
};

// Parses the command-line arguments (argc/argv) into `params`, which is
// passed by mutable reference.
// NOTE(review): the definition is not in this header; the bool result
// presumably signals whether parsing succeeded - confirm against the
// implementation.
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

// Prints usage information for the program.
// NOTE(review): `params` is taken by const reference, presumably so the
// current/default values can be shown in the help text - confirm against the
// implementation.
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

// Returns a prompt string; takes the caller's std::mt19937 engine by mutable
// reference.
// NOTE(review): presumably the engine is advanced to pick the prompt, and this
// is what gpt_params::random_prompt enables - confirm against the
// implementation.
std::string gpt_random_prompt(std::mt19937 & rng);

//
// Vocab utils
//

// Convenience overload that tokenizes `text` using `ctx` and returns the
// tokens as a std::vector<llama_token>.
// NOTE(review): `add_bos` presumably controls whether a beginning-of-sequence
// token is prepended - confirm against the implementation.
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);

//
// Console utils
//

// ANSI escape sequences of the form ESC '[' <code> 'm' for styling terminal
// text: codes 31-36 select a foreground color, 0 resets all attributes and
// 1 switches to bold. Each macro expands to a string literal, so it can be
// concatenated with adjacent string literals.
#define ANSI_COLOR_RED     "\x1b[31m"
#define ANSI_COLOR_GREEN   "\x1b[32m"
#define ANSI_COLOR_YELLOW  "\x1b[33m"
#define ANSI_COLOR_BLUE    "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN    "\x1b[36m"
#define ANSI_COLOR_RESET   "\x1b[0m"
#define ANSI_BOLD          "\x1b[1m"

// Logical console colors that can be selected via set_console_color().
// Values are consecutive, starting at 0 for the default color.
enum console_color_t {
    CONSOLE_COLOR_DEFAULT    = 0, // default console color
    CONSOLE_COLOR_PROMPT     = 1, // color for the prompt
    CONSOLE_COLOR_USER_INPUT = 2, // color for user input
};

// Console state handed to set_console_color() by mutable reference.
// A default-constructed instance has coloring off and the default color
// selected.
struct console_state {
    // NOTE(review): presumably gates whether any color codes are emitted - confirm.
    bool            use_color{false};
    // Color currently recorded for the console.
    console_color_t color{CONSOLE_COLOR_DEFAULT};
};

// Switches the console to `color`; `con_st` is passed by mutable reference.
// NOTE(review): presumably a no-op when con_st.use_color is false and updates
// con_st.color otherwise - confirm against the implementation.
void set_console_color(console_state & con_st, console_color_t color);

// Windows-only helpers: declared only when _WIN32 is defined.
#if defined (_WIN32)
// Initializes the Windows console.
// NOTE(review): `enable_color` presumably turns on escape-sequence handling
// for colored output - confirm against the implementation.
void win32_console_init(bool enable_color);
// Encodes the wide string `wstr` as UTF-8 (per the function name) and writes
// the result to the output parameter `str`.
void win32_utf8_encode(const std::wstring & wstr, std::string & str);
#endif
Initial release 2023-03-10 19:40:58 +01:00			`// Various helper functions and utilities`

			`#pragma once`

Introduce C-style API (#370) * Major refactoring - introduce C-style API * Clean up * Add <cassert> * Add <iterator> * Add <algorithm> .... * Fix timing reporting and accumulation * Measure eval time only for single-token calls * Change llama_tokenize return meaning 2023-03-22 06:32:36 +01:00			`#include "llama.h"`

Initial release 2023-03-10 19:40:58 +01:00			`#include <string>`
			`#include <vector>`
			`#include <random>`
			`#include <thread>`

			`//`
			`// CLI argument parsing`
			`//`

			`struct gpt_params {`
Revert "main : alternative instruct mode (Vicuna support, etc.) (#863)" (#982) This reverts commit f4d277ae17247ee51129ef1a9ff74d377cc90b1b. 2023-04-14 21:58:43 +02:00			`int32_t seed = -1; // RNG seed`
			`int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());`
			`int32_t n_predict = 128; // new tokens to predict`
			`int32_t repeat_last_n = 64; // last n tokens to penalize`
			`int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)`
			`int32_t n_ctx = 512; // context size`
llama : have n_batch default to 512 (#1091) * set default n_batch to 512 when using BLAS * spacing * alternate implementation of setting different n_batch for BLAS * set n_batch to 512 for all cases 2023-04-22 10:27:05 +02:00			`int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)`
Revert "main : alternative instruct mode (Vicuna support, etc.) (#863)" (#982) This reverts commit f4d277ae17247ee51129ef1a9ff74d377cc90b1b. 2023-04-14 21:58:43 +02:00			`int32_t n_keep = 0; // number of tokens to keep from initial prompt`
Default to 4 threads (#243) 2023-03-17 20:46:46 +01:00
Initial release 2023-03-10 19:40:58 +01:00			`// sampling parameters`
Add back top_k (#56) * Add back top_k * Update utils.cpp * Update utils.h --------- Co-authored-by: Bill Hamilton <bill.hamilton@shopify.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-12 21:23:15 +01:00			`int32_t top_k = 40;`
Initial release 2023-03-10 19:40:58 +01:00			`float top_p = 0.95f;`
Final touches 2023-03-10 20:50:46 +01:00			`float temp = 0.80f;`
Change default repeat_penalty to 1.0 I feel this penalty is not really helping. Especially for the example from the README it makes results pretty bad 2023-03-21 16:32:14 +01:00			`float repeat_penalty = 1.10f;`
Initial release 2023-03-10 19:40:58 +01:00
Change default repeat_penalty to 1.0 I feel this penalty is not really helping. Especially for the example from the README it makes results pretty bad 2023-03-21 16:32:14 +01:00			`std::string model = "models/lamma-7B/ggml-model.bin"; // model path`
			`std::string prompt = "";`
Add LoRA support (#820) 2023-04-17 17:28:55 +02:00			`std::string input_prefix = ""; // string to prefix user inputs with`
Change default repeat_penalty to 1.0 I feel this penalty is not really helping. Especially for the example from the README it makes results pretty bad 2023-03-21 16:32:14 +01:00			`std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted`
Add interactive mode (#61) * Initial work on interactive mode. * Improve interactive mode. Make rev. prompt optional. * Update README to explain interactive mode. * Fix OS X build 2023-03-12 22:13:28 +01:00
Add LoRA support (#820) 2023-04-17 17:28:55 +02:00			`std::string lora_adapter = ""; // lora adapter path`
			`std::string lora_base = ""; // base model path for the lora adapter`

Reduce memory usage and allocate enough memory for largest context (#473) * Reduce memory usage and allocate enough memory for large contexts * Simpler scratch buffer usage * Reenable BLAS for quantized mul_mat * Fix number of layers in 30B and 65B * Fix KV cache size for F32 2023-03-24 22:17:37 +01:00			`bool memory_f16 = true; // use f16 instead of f32 for memory kv`
Change default repeat_penalty to 1.0 I feel this penalty is not really helping. Especially for the example from the README it makes results pretty bad 2023-03-21 16:32:14 +01:00			`bool random_prompt = false; // do not randomize prompt if none provided`
			`bool use_color = false; // use color to distinguish generations and inputs`
			`bool interactive = false; // interactive mode`
Add embedding mode with arg flag. Currently working (#282) * working but ugly * add arg flag, not working on embedding mode * typo * Working! Thanks to @nullhook * make params argument instead of hardcoded boolean. remove useless time check * start doing the instructions but not finished. This probably doesnt compile * Embeddings extraction support --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-24 16:05:13 +01:00
			`bool embedding = false; // get only sentence embedding`
examples/main README improvements and some light refactoring (#1131) 2023-04-24 17:45:32 +02:00			`bool interactive_first = false; // wait for user input immediately`
Add embedding mode with arg flag. Currently working (#282) * working but ugly * add arg flag, not working on embedding mode * typo * Working! Thanks to @nullhook * make params argument instead of hardcoded boolean. remove useless time check * start doing the instructions but not finished. This probably doesnt compile * Embeddings extraction support --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-24 16:05:13 +01:00
Change default repeat_penalty to 1.0 I feel this penalty is not really helping. Especially for the example from the README it makes results pretty bad 2023-03-21 16:32:14 +01:00			`bool instruct = false; // instruction mode (used for Alpaca models)`
			`bool ignore_eos = false; // do not stop generating after eos`
Compute perplexity over prompt (#270) * Compute perplexity over prompt * More accurate perplexity calculation - over all logits in the context window (so 512x more tokens!) * Output all perplexitiies * Add timing/ETA 2023-03-21 17:27:42 +01:00			`bool perplexity = false; // compute perplexity over the prompt`
Rewrite loading code to try to satisfy everyone: - Support all three formats (ggml, ggmf, ggjt). (However, I didn't include the hack needed to support GPT4All files without conversion. Those can still be used after converting them with convert.py from my other PR.) - Support both mmap and read (mmap is used by default, but can be disabled with `--no-mmap`, and is automatically disabled for pre-ggjt files or on platforms where mmap is not supported). - Support multi-file models like before, but automatically determine the number of parts rather than requiring `--n_parts`. - Improve validation and error checking. - Stop using the per-file type field (f16) entirely in favor of just relying on the per-tensor type/size fields. This has no immediate benefit, but makes it easier to experiment with different formats, and should make it easier to support the new GPTQ-for-LLaMa models in the future (I have some work in progress on that front). - Support VirtualLock on Windows (using the same `--mlock` option as on Unix). - Indicate loading progress when using mmap + mlock. (Which led me to the interesting observation that on my Linux machine, with a warm file cache, mlock actually takes some time, whereas mmap without mlock starts almost instantly...) - To help implement this, move mlock support from ggml to the loading code. - madvise/PrefetchVirtualMemory support (based on #740) - Switch from ifstream to the `fopen` family of functions to avoid unnecessary copying and, when mmap is enabled, allow reusing the same file descriptor for both metadata reads and mmap (whereas the existing implementation opens the file a second time to mmap). - Quantization now produces a single-file output even with multi-file inputs (not really a feature as much as 'it was easier this way'). Implementation notes: I tried to factor the code into more discrete pieces than before. 
Regarding code style: I tried to follow the code style, but I'm naughty and used a few advanced C++ features repeatedly: - Destructors to make it easier to ensure everything gets cleaned up. - Exceptions. I don't even usually use exceptions when writing C++, and I can remove them if desired... but here they make the loading code much more succinct while still properly handling a variety of errors, ranging from API calls failing to integer overflow and allocation failure. The exceptions are converted to error codes at the API boundary.) Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740) 2023-04-08 21:24:37 +02:00			`bool use_mmap = true; // use mmap for faster loads`
Support calling mlock() on loaded model data on Linux and macOS (#453) * Support calling mlock() on loaded model data on Linux and macOS This is enabled by a new --mlock command line option. Using mlock() disables swapping and memory compression for the model data. Doing so can be useful on systems where the model takes up a large fraction of system RAM. In my experience, macOS is quite eager to start compressing llama.cpp's memory, which then makes it halt for a few seconds while it decompresses, even with a model that uses "only" 25GB out of 32GB. Of course, this comes at the cost of forcing the system to swap or compress other processes' memory instead, so it needs to be used with care and shouldn't be enabled by default. In theory it should be possible to support this on Windows as well using VirtualLock(), but I'm not much of a Windows user. * Update llama.cpp --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-24 16:19:05 +01:00			`bool use_mlock = false; // use mlock to keep model in memory`
Reduce memory usage and allocate enough memory for largest context (#473) * Reduce memory usage and allocate enough memory for large contexts * Simpler scratch buffer usage * Reenable BLAS for quantized mul_mat * Fix number of layers in 30B and 65B * Fix KV cache size for F32 2023-03-24 22:17:37 +01:00			`bool mem_test = false; // compute maximum memory usage`
Disable prompt verbosity by default and add option to enable (#480) 2023-03-25 16:16:50 +01:00			`bool verbose_prompt = false; // print prompt tokens before generation`
Initial release 2023-03-10 19:40:58 +01:00			`};`

			`bool gpt_params_parse(int argc, char ** argv, gpt_params & params);`

Revert "main : alternative instruct mode (Vicuna support, etc.) (#863)" (#982) This reverts commit f4d277ae17247ee51129ef1a9ff74d377cc90b1b. 2023-04-14 21:58:43 +02:00			`void gpt_print_usage(int argc, char ** argv, const gpt_params & params);`
Initial release 2023-03-10 19:40:58 +01:00
			`std::string gpt_random_prompt(std::mt19937 & rng);`

			`//`
			`// Vocab utils`
			`//`

Introduce C-style API (#370) * Major refactoring - introduce C-style API * Clean up * Add <cassert> * Add <iterator> * Add <algorithm> .... * Fix timing reporting and accumulation * Measure eval time only for single-token calls * Change llama_tokenize return meaning 2023-03-22 06:32:36 +01:00			`std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);`
main.cpp fixes, refactoring (#571) - main: entering empty line passes back control without new input in interactive/instruct modes - instruct mode: keep prompt fix - instruct mode: duplicate instruct prompt fix - refactor: move common console code from main->common 2023-03-28 16:09:55 +02:00
			`//`
			`// Console utils`
			`//`

			`#define ANSI_COLOR_RED "\x1b[31m"`
			`#define ANSI_COLOR_GREEN "\x1b[32m"`
			`#define ANSI_COLOR_YELLOW "\x1b[33m"`
			`#define ANSI_COLOR_BLUE "\x1b[34m"`
			`#define ANSI_COLOR_MAGENTA "\x1b[35m"`
			`#define ANSI_COLOR_CYAN "\x1b[36m"`
			`#define ANSI_COLOR_RESET "\x1b[0m"`
			`#define ANSI_BOLD "\x1b[1m"`

			`enum console_color_t {`
			`CONSOLE_COLOR_DEFAULT=0,`
			`CONSOLE_COLOR_PROMPT,`
			`CONSOLE_COLOR_USER_INPUT`
			`};`

			`struct console_state {`
			`bool use_color = false;`
			`console_color_t color = CONSOLE_COLOR_DEFAULT;`
			`};`

			`void set_console_color(console_state & con_st, console_color_t color);`

			`#if defined (_WIN32)`
			`void win32_console_init(bool enable_color);`
fix for windows utf-8 input (#840) Use UTF-16 as input on Windows, since UTF-8 does not work and reads multibyte characters as zeros 2023-04-08 17:49:39 +02:00			`void win32_utf8_encode(const std::wstring & wstr, std::string & str);`
main.cpp fixes, refactoring (#571) - main: entering empty line passes back control without new input in interactive/instruct modes - instruct mode: keep prompt fix - instruct mode: duplicate instruct prompt fix - refactor: move common console code from main->common 2023-03-28 16:09:55 +02:00			`#endif`