2023-03-10 19:40:58 +01:00
|
|
|
// Various helper functions and utilities
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
2023-03-22 06:32:36 +01:00
|
|
|
#include "llama.h"
|
|
|
|
|
2023-03-10 19:40:58 +01:00
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
#include <random>
|
|
|
|
#include <thread>
|
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
|
|
|
#include <unordered_map>
|
2023-06-24 10:47:58 +02:00
|
|
|
#include <tuple>
|
2023-03-10 19:40:58 +01:00
|
|
|
|
|
|
|
//
|
|
|
|
// CLI argument parsing
|
|
|
|
//
|
2023-04-30 20:41:35 +02:00
|
|
|
// Declared ahead of gpt_params because its result is the default value of
// gpt_params::n_threads.
// NOTE(review): presumably returns the number of physical (not logical) CPU
// cores, as the name suggests - confirm against the definition.
int32_t get_num_physical_cores();
|
2023-03-10 19:40:58 +01:00
|
|
|
|
|
|
|
struct gpt_params {
|
2023-07-24 17:57:12 +02:00
|
|
|
uint32_t seed = -1; // RNG seed
|
2023-06-14 19:47:19 +02:00
|
|
|
int32_t n_threads = get_num_physical_cores();
|
2023-07-24 17:57:12 +02:00
|
|
|
int32_t n_predict = -1; // new tokens to predict
|
|
|
|
int32_t n_ctx = 512; // context size
|
|
|
|
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
|
|
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
|
|
|
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
|
|
|
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
|
|
|
|
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
|
|
|
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
|
|
|
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
2023-08-25 17:18:48 +02:00
|
|
|
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
llama : add custom RoPE (#2054)
* Implement customizable RoPE
The original RoPE has pre-defined parameters
theta_i = 10000^(−2(i−1)/d), for i in [1, 2, ..., d/2]
Our customizable RoPE, ggml_rope_custom_inplace, uses
theta_i = scale * base^(−2(i−1)/d), for i in [1, 2, ..., d/2]
with the default matches the original
scale = 1.0
base = 10000
The new command line arguments
--rope-freq-base
--rope-freq-scale
set the two new RoPE parameter.
Recent researches show changing these two parameters extends the context limit with minimal loss.
1. Extending Context to 8K
kaiokendev
https://kaiokendev.github.io/til#extending-context-to-8k
2. Extending Context Window of Large Language Models via Positional Interpolation
Shouyuan Chen, Sherman Wong, Liangjian Chen, Yuandong Tian
https://arxiv.org/abs/2306.15595
3. NTK-Aware Scaled RoPE allows LLaMA models to have extended (8k+) context size without any fine-tuning and minimal perplexity degradation.
https://www.reddit.com/user/bloc97
https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
For the bold, try adding the following command line parameters to your favorite model:
-c 16384 --rope-freq-base 80000 --rope-freq-scale 0.5
* ggml-metal: fix custom rope
* common: fix argument names in help
* llama: increase MEM_REQ_EVAL for MODEL_3B
It avoids crashing for quantized weights on CPU.
Better ways to calculate the required buffer size would be better.
* llama: make MEM_REQ_EVAL depend on n_ctx
* server: use proper Content-Type in curl examples
Without the header Content-Type: application/json, curl will POST with
Content-Type: application/x-www-form-urlencoded
Though our simple server doesn't care, the httplib.h used has a limit
with CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 8192
With Content-Type: application/json, we can send large json data.
* style : minor fixes, mostly indentations
* ggml : fix asserts
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-07-15 12:34:16 +02:00
|
|
|
float rope_freq_base = 10000.0f; // RoPE base frequency
|
|
|
|
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
|
2023-03-17 20:46:46 +01:00
|
|
|
|
2023-03-10 19:40:58 +01:00
|
|
|
// sampling parameters
|
2023-04-29 08:51:06 +02:00
|
|
|
int32_t top_k = 40; // <= 0 to use vocab size
|
|
|
|
float top_p = 0.95f; // 1.0 = disabled
|
|
|
|
float tfs_z = 1.00f; // 1.0 = disabled
|
|
|
|
float typical_p = 1.00f; // 1.0 = disabled
|
|
|
|
float temp = 0.80f; // 1.0 = disabled
|
|
|
|
float repeat_penalty = 1.10f; // 1.0 = disabled
|
|
|
|
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
|
|
|
float frequency_penalty = 0.00f; // 0.0 = disabled
|
|
|
|
float presence_penalty = 0.00f; // 0.0 = disabled
|
2023-07-23 14:09:47 +02:00
|
|
|
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
2023-04-29 08:51:06 +02:00
|
|
|
float mirostat_tau = 5.00f; // target entropy
|
|
|
|
float mirostat_eta = 0.10f; // learning rate
|
2023-03-10 19:40:58 +01:00
|
|
|
|
2023-08-21 22:07:43 +02:00
|
|
|
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
|
|
|
|
2023-07-11 18:18:43 +02:00
|
|
|
// Classifier-Free Guidance
|
|
|
|
// https://arxiv.org/abs/2306.17806
|
|
|
|
std::string cfg_negative_prompt; // string to help guidance
|
|
|
|
float cfg_scale = 1.f; // How strong is guidance
|
|
|
|
|
2023-08-21 22:07:43 +02:00
|
|
|
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
|
2023-05-28 19:14:24 +02:00
|
|
|
std::string model_alias = "unknown"; // model alias
|
2023-05-19 19:14:51 +02:00
|
|
|
std::string prompt = "";
|
2023-05-10 17:37:14 +02:00
|
|
|
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
|
|
|
std::string input_prefix = ""; // string to prefix user inputs with
|
|
|
|
std::string input_suffix = ""; // string to suffix user inputs with
|
2023-07-24 05:58:10 +02:00
|
|
|
std::string grammar = ""; // optional BNF-like grammar to constrain sampling
|
2023-03-21 16:32:14 +01:00
|
|
|
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
2023-03-12 22:13:28 +01:00
|
|
|
|
2023-04-17 17:28:55 +02:00
|
|
|
std::string lora_adapter = ""; // lora adapter path
|
2023-05-19 19:14:51 +02:00
|
|
|
std::string lora_base = ""; // base model path for the lora adapter
|
2023-04-17 17:28:55 +02:00
|
|
|
|
2023-08-23 11:56:42 +02:00
|
|
|
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
|
|
|
|
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
|
|
|
// (which is more convenient to use for plotting)
|
|
|
|
//
|
2023-07-28 20:25:36 +02:00
|
|
|
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
|
|
|
|
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
|
|
|
|
|
|
|
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
|
2023-08-22 22:47:05 +02:00
|
|
|
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
|
2023-03-24 22:17:37 +01:00
|
|
|
bool memory_f16 = true; // use f16 instead of f32 for memory kv
|
2023-03-21 16:32:14 +01:00
|
|
|
bool random_prompt = false; // do not randomize prompt if none provided
|
|
|
|
bool use_color = false; // use color to distinguish generations and inputs
|
|
|
|
bool interactive = false; // interactive mode
|
2023-05-10 17:37:14 +02:00
|
|
|
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
2023-06-07 04:10:17 +02:00
|
|
|
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
2023-03-24 16:05:13 +01:00
|
|
|
|
|
|
|
bool embedding = false; // get only sentence embedding
|
2023-04-24 17:45:32 +02:00
|
|
|
bool interactive_first = false; // wait for user input immediately
|
2023-05-09 04:45:48 +02:00
|
|
|
bool multiline_input = false; // reverse the usage of `\`
|
2023-08-04 17:20:12 +02:00
|
|
|
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
2023-03-24 16:05:13 +01:00
|
|
|
|
2023-07-25 14:19:11 +02:00
|
|
|
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
2023-08-21 22:07:43 +02:00
|
|
|
bool ignore_eos = false; // ignore generated EOS tokens
|
2023-03-21 16:32:14 +01:00
|
|
|
bool instruct = false; // instruction mode (used for Alpaca models)
|
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
|
|
|
bool penalize_nl = true; // consider newlines as a repeatable token
|
2023-03-21 17:27:42 +01:00
|
|
|
bool perplexity = false; // compute perplexity over the prompt
|
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
|
|
|
bool use_mmap = true; // use mmap for faster loads
|
2023-03-24 16:19:05 +01:00
|
|
|
bool use_mlock = false; // use mlock to keep model in memory
|
2023-03-24 22:17:37 +01:00
|
|
|
bool mem_test = false; // compute maximum memory usage
|
2023-06-26 19:57:59 +02:00
|
|
|
bool numa = false; // attempt optimizations that help on some NUMA systems
|
2023-06-04 22:34:30 +02:00
|
|
|
bool export_cgraph = false; // export the computation graph
|
2023-03-25 16:16:50 +01:00
|
|
|
bool verbose_prompt = false; // print prompt tokens before generation
|
2023-03-10 19:40:58 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
// Parses the command line given as argc/argv into `params`, which is taken by
// mutable reference.
// NOTE(review): the bool result presumably reports whether parsing succeeded -
// confirm against the definition.
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
|
|
|
|
2023-04-14 21:58:43 +02:00
|
|
|
// Prints usage information for the command-line options.
// `params` is read-only here.
// NOTE(review): presumably its values are shown as the option defaults - confirm.
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
|
2023-03-10 19:40:58 +01:00
|
|
|
|
|
|
|
// Returns a prompt string. `rng` is taken by mutable reference.
// NOTE(review): presumably the generator is advanced to pick the prompt at
// random - confirm against the definition.
std::string gpt_random_prompt(std::mt19937 & rng);
|
|
|
|
|
|
|
|
//
|
2023-08-21 22:07:43 +02:00
|
|
|
// Model utils
|
2023-03-10 19:40:58 +01:00
|
|
|
//
|
|
|
|
|
2023-08-21 22:07:43 +02:00
|
|
|
// Returns a (model, context) pair of raw pointers created from `params`.
// `params` is taken by non-const reference, so the call may modify it.
// NOTE(review): ownership of the returned pointers and the failure convention
// (e.g. null pointers) are not visible from this header - confirm before use.
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
|
|
|
|
// Builds a llama_context_params value from `params`; `params` is read-only.
// NOTE(review): which gpt_params fields are carried over is defined in the
// implementation, not here - confirm there.
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
|
2023-03-28 16:09:55 +02:00
|
|
|
|
2023-05-02 22:39:51 +02:00
|
|
|
//
|
2023-08-21 22:07:43 +02:00
|
|
|
// Vocab utils
|
2023-05-02 22:39:51 +02:00
|
|
|
//
|
|
|
|
|
2023-08-27 13:19:19 +02:00
|
|
|
// tokenizes a string into a vector of tokens
|
|
|
|
// should work similar to Python's `tokenizer.encode`
|
2023-08-21 22:07:43 +02:00
|
|
|
std::vector<llama_token> llama_tokenize(
|
|
|
|
struct llama_context * ctx,
|
|
|
|
const std::string & text,
|
|
|
|
bool add_bos);
|
|
|
|
|
2023-08-27 13:19:19 +02:00
|
|
|
// tokenizes a token into a piece
|
|
|
|
// should work similar to Python's `tokenizer.id_to_piece`
|
|
|
|
std::string llama_token_to_piece(
|
2023-08-21 22:07:43 +02:00
|
|
|
const struct llama_context * ctx,
|
|
|
|
llama_token token);
|
2023-08-27 13:19:19 +02:00
|
|
|
|
|
|
|
// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
|
|
|
|
// that takes into account the tokenizer type and decides how to handle the leading space
|
|
|
|
//
|
|
|
|
// detokenizes a vector of tokens into a string
|
|
|
|
// should work similar to Python's `tokenizer.decode`
|
|
|
|
// removes the leading space from the first non-BOS token
|
|
|
|
std::string llama_detokenize_spm(
|
|
|
|
llama_context * ctx,
|
|
|
|
const std::vector<llama_token> & tokens);
|
|
|
|
|
|
|
|
// detokenizes a vector of tokens into a string
|
|
|
|
// should work similar to Python's `tokenizer.decode`
|
|
|
|
std::string llama_detokenize_bpe(
|
|
|
|
llama_context * ctx,
|
|
|
|
const std::vector<llama_token> & tokens);
|