mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-02 17:25:45 +01:00
563cdc391d
* Support calling mlock() on loaded model data on Linux and macOS. This is enabled by a new --mlock command line option. Using mlock() disables swapping and memory compression for the model data. Doing so can be useful on systems where the model takes up a large fraction of system RAM. In my experience, macOS is quite eager to start compressing llama.cpp's memory, which then makes it halt for a few seconds while it decompresses, even with a model that uses "only" 25GB out of 32GB. Of course, this comes at the cost of forcing the system to swap or compress other processes' memory instead, so it needs to be used with care and shouldn't be enabled by default. In theory it should be possible to support this on Windows as well using VirtualLock(), but I'm not much of a Windows user. * Update llama.cpp --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
63 lines
2.0 KiB
C++
63 lines
2.0 KiB
C++
// Various helper functions and utilities
|
|
|
|
#pragma once
|
|
|
|
#include "llama.h"
|
|
|
|
#include <string>
|
|
#include <vector>
|
|
#include <random>
|
|
#include <thread>
|
|
|
|
//
|
|
// CLI argument parsing
|
|
//
|
|
|
|
// Runtime options for the command-line tools. Every field carries its default
// value; gpt_params_parse() receives a gpt_params by reference (its definition
// is not part of this header).
struct gpt_params {
    // Default worker-thread count: the reported hardware concurrency, capped at 4.
    // std::thread::hardware_concurrency() is permitted to return 0 when the value
    // cannot be determined; in that case fall back to the cap rather than
    // defaulting to zero threads.
    static int32_t default_n_threads() {
        const int32_t max_default = 4;
        const unsigned int hw = std::thread::hardware_concurrency();
        if (hw == 0) {
            return max_default; // core count unknown
        }
        return hw < static_cast<unsigned int>(max_default) ? static_cast<int32_t>(hw) : max_default;
    }

    int32_t seed          = -1;  // RNG seed
    int32_t n_threads     = default_n_threads(); // number of threads (at most 4 by default)
    int32_t n_predict     = 128; // new tokens to predict
    int32_t repeat_last_n = 64;  // last n tokens to penalize
    int32_t n_parts       = -1;  // amount of model parts (-1 = determine from model dimensions)
    int32_t n_ctx         = 512; // context size

    // sampling parameters
    int32_t top_k          = 40;
    float   top_p          = 0.95f;
    float   temp           = 0.80f;
    float   repeat_penalty = 1.10f;

    int32_t n_batch = 8; // batch size for prompt processing

    // NOTE(review): "lamma" looks like a typo for "llama"; the literal is left
    // untouched because the default path is user-visible - confirm before renaming.
    std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
    std::string prompt = "";

    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

    bool memory_f16    = false; // use f16 instead of f32 for memory kv
    bool random_prompt = false; // randomize the prompt if none is provided (off by default)
    bool use_color     = false; // use color to distinguish generations and inputs
    bool interactive   = false; // interactive mode

    bool embedding         = false; // get only sentence embedding
    bool interactive_start = false; // wait for user input immediately

    bool instruct   = false; // instruction mode (used for Alpaca models)
    bool ignore_eos = false; // do not stop generating after eos
    bool perplexity = false; // compute perplexity over the prompt
    bool use_mlock  = false; // use mlock to keep model in memory
};
|
|
|
|
// Command-line parser entry point: takes the program's argc/argv and a
// gpt_params to fill in (passed by mutable reference).
// NOTE(review): defined outside this header - presumably returns false when
// the arguments cannot be parsed; confirm against the implementation.
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
|
|
|
// Prints usage/help for the command-line options. `params` is taken by const
// reference and is not modified.
// NOTE(review): presumably `params` supplies the default values shown in the
// help text and argv[0] the program name - confirm against the implementation.
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
|
|
|
|
// Returns a prompt string chosen using the caller-supplied Mersenne Twister
// engine. `rng` is taken by non-const reference.
// NOTE(review): the set of candidate prompts lives in the implementation,
// which is not visible from this header.
std::string gpt_random_prompt(std::mt19937 & rng);
|
|
|
|
//
|
|
// Vocab utils
|
|
//
|
|
|
|
// Convenience wrapper that tokenizes `text` using the given llama context and
// returns the tokens as a std::vector.
// NOTE(review): `add_bos` presumably requests a leading beginning-of-sequence
// token - confirm against the implementation and llama.h.
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
|