2023-03-10 19:40:58 +01:00
|
|
|
// Various helper functions and utilities
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
2023-03-22 06:32:36 +01:00
|
|
|
#include "llama.h"

#include <algorithm>
#include <cstdint>
#include <random>
#include <string>
#include <thread>
#include <vector>
|
|
|
|
|
|
|
|
//
|
|
|
|
// CLI argument parsing
|
|
|
|
//
|
|
|
|
|
|
|
|
// Bundle of command-line configurable settings together with their defaults.
// Every member has an in-class initializer, so a default-constructed
// gpt_params is fully initialized.
struct gpt_params {
    int32_t seed          = -1;  // RNG seed
    // std::thread::hardware_concurrency() is allowed to return 0 when the
    // core count cannot be determined; clamp to [1, 4] so the default is
    // never zero threads.
    int32_t n_threads     = std::max<int32_t>(1, std::min<int32_t>(4, static_cast<int32_t>(std::thread::hardware_concurrency())));
    int32_t n_predict     = 128; // new tokens to predict
    int32_t repeat_last_n = 64;  // last n tokens to penalize
    int32_t n_parts       = -1;  // amount of model parts (-1 = determine from model dimensions)
    int32_t n_ctx         = 512; // context size
    int32_t n_batch       = 512; // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt

    // sampling parameters
    int32_t top_k          = 40;
    float   top_p          = 0.95f;
    float   temp           = 0.80f;
    float   repeat_penalty = 1.10f;

    // NOTE(review): "lamma" looks like a typo for "llama"; the literal is left
    // untouched because it is the runtime default path - confirm before changing.
    std::string model        = "models/lamma-7B/ggml-model.bin"; // model path
    std::string prompt       = "";
    std::string input_prefix = ""; // string to prefix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

    std::string lora_adapter = ""; // lora adapter path
    std::string lora_base    = ""; // base model path for the lora adapter

    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // randomize prompt if none provided (default: do not randomize)
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode

    bool embedding         = false; // get only sentence embedding
    bool interactive_first = false; // wait for user input immediately

    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool ignore_eos        = false; // do not stop generating after eos
    bool perplexity        = false; // compute perplexity over the prompt
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool mem_test          = false; // compute maximum memory usage
    bool verbose_prompt    = false; // print prompt tokens before generation
};
|
|
|
|
|
|
|
|
// Parses the command-line arguments (argc/argv) into `params`, which is taken
// by non-const reference and may therefore be modified.
// NOTE(review): the meaning of the bool result is defined by the implementation,
// which is not visible here - presumably true on success; confirm.
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
|
|
|
|
2023-04-14 21:58:43 +02:00
|
|
|
// Prints usage information for the command-line interface.
// `params` is read-only here (const reference).
// NOTE(review): presumably used to display the default values - confirm
// against the implementation.
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
|
2023-03-10 19:40:58 +01:00
|
|
|
|
|
|
|
// Returns a prompt string selected with the help of `rng`.
// `rng` is taken by non-const reference, so its state may be advanced.
// NOTE(review): the set of candidate prompts lives in the implementation - confirm there.
std::string gpt_random_prompt(std::mt19937 & rng);
|
|
|
|
|
|
|
|
//
|
|
|
|
// Vocab utils
|
|
|
|
//
|
|
|
|
|
2023-03-22 06:32:36 +01:00
|
|
|
// Convenience wrapper that tokenizes `text` using `ctx` and returns the tokens
// as a std::vector<llama_token>.
// NOTE(review): `add_bos` presumably controls whether a beginning-of-sequence
// token is prepended - confirm against the implementation.
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
|
2023-03-28 16:09:55 +02:00
|
|
|
|
|
|
|
//
|
|
|
|
// Console utils
|
|
|
|
//
|
|
|
|
|
|
|
|
// ANSI escape sequences (ESC '[' <code> 'm') for terminal text attributes.
// Kept as macros so they can be concatenated with adjacent string literals,
// e.g. ANSI_COLOR_RED "error" ANSI_COLOR_RESET.
#define ANSI_COLOR_RED     "\x1b[31m"
#define ANSI_COLOR_GREEN   "\x1b[32m"
#define ANSI_COLOR_YELLOW  "\x1b[33m"
#define ANSI_COLOR_BLUE    "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN    "\x1b[36m"
#define ANSI_COLOR_RESET   "\x1b[0m"  // restore default attributes
#define ANSI_BOLD          "\x1b[1m"
|
|
|
|
|
|
|
|
// Logical console color categories. Left as an unscoped enum so existing
// callers can keep using the enumerators without qualification.
enum console_color_t {
    CONSOLE_COLOR_DEFAULT = 0, // explicit 0; the enumerators below follow as 1 and 2
    CONSOLE_COLOR_PROMPT,
    CONSOLE_COLOR_USER_INPUT
};
|
|
|
|
|
|
|
|
struct console_state {
|
|
|
|
bool use_color = false;
|
|
|
|
console_color_t color = CONSOLE_COLOR_DEFAULT;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Applies `color` to the console represented by `con_st`.
// `con_st` is taken by non-const reference, so it may be updated.
// NOTE(review): presumably a no-op when con_st.use_color is false and records
// the new color in con_st.color - confirm against the implementation.
void set_console_color(console_state & con_st, console_color_t color);
|
|
|
|
|
|
|
|
#if defined (_WIN32)
// Windows-only helpers; these declarations are compiled only when _WIN32 is defined.

// Initializes the Windows console.
// NOTE(review): `enable_color` presumably toggles escape-sequence/color
// support - confirm against the implementation.
void win32_console_init(bool enable_color);

// Converts the wide string `wstr` to UTF-8 (per the function name) and writes
// the result into `str`, which is an output parameter (non-const reference).
void win32_utf8_encode(const std::wstring & wstr, std::string & str);
#endif
|