mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-26 03:12:23 +01:00
common : move arg parser code to arg.cpp
(#9388)
* common : move arg parser to arg.cpp * better categorize args * add cmake * missing climits * missing cstdarg * common : more explicit includes * fix build * refactor gpt_params_parse * update server readme * fix test --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
293bebe077
commit
bfe76d4a17
6
Makefile
6
Makefile
@ -925,6 +925,7 @@ OBJ_LLAMA = \
|
|||||||
|
|
||||||
OBJ_COMMON = \
|
OBJ_COMMON = \
|
||||||
common/common.o \
|
common/common.o \
|
||||||
|
common/arg.o \
|
||||||
common/console.o \
|
common/console.o \
|
||||||
common/ngram-cache.o \
|
common/ngram-cache.o \
|
||||||
common/sampling.o \
|
common/sampling.o \
|
||||||
@ -1157,6 +1158,11 @@ common/common.o: \
|
|||||||
include/llama.h
|
include/llama.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
common/arg.o: \
|
||||||
|
common/arg.cpp \
|
||||||
|
common/arg.h
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
common/sampling.o: \
|
common/sampling.o: \
|
||||||
common/sampling.cpp \
|
common/sampling.cpp \
|
||||||
common/sampling.h \
|
common/sampling.h \
|
||||||
|
@ -54,6 +54,8 @@ add_library(${TARGET} STATIC
|
|||||||
base64.hpp
|
base64.hpp
|
||||||
common.h
|
common.h
|
||||||
common.cpp
|
common.cpp
|
||||||
|
arg.h
|
||||||
|
arg.cpp
|
||||||
sampling.h
|
sampling.h
|
||||||
sampling.cpp
|
sampling.cpp
|
||||||
console.h
|
console.h
|
||||||
|
1994
common/arg.cpp
Normal file
1994
common/arg.cpp
Normal file
File diff suppressed because it is too large
Load Diff
77
common/arg.h
Normal file
77
common/arg.h
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#include <set>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
//
|
||||||
|
// CLI argument parsing
|
||||||
|
//
|
||||||
|
|
||||||
|
struct llama_arg {
|
||||||
|
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
|
||||||
|
std::vector<const char *> args;
|
||||||
|
const char * value_hint = nullptr; // help text or example for arg value
|
||||||
|
const char * value_hint_2 = nullptr; // for second arg value
|
||||||
|
const char * env = nullptr;
|
||||||
|
std::string help;
|
||||||
|
bool is_sparam = false; // is current arg a sampling param?
|
||||||
|
void (*handler_void) (gpt_params & params) = nullptr;
|
||||||
|
void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
|
||||||
|
void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
|
||||||
|
void (*handler_int) (gpt_params & params, int) = nullptr;
|
||||||
|
|
||||||
|
llama_arg(
|
||||||
|
const std::initializer_list<const char *> & args,
|
||||||
|
const char * value_hint,
|
||||||
|
const std::string & help,
|
||||||
|
void (*handler)(gpt_params & params, const std::string &)
|
||||||
|
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
|
||||||
|
|
||||||
|
llama_arg(
|
||||||
|
const std::initializer_list<const char *> & args,
|
||||||
|
const char * value_hint,
|
||||||
|
const std::string & help,
|
||||||
|
void (*handler)(gpt_params & params, int)
|
||||||
|
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
|
||||||
|
|
||||||
|
llama_arg(
|
||||||
|
const std::initializer_list<const char *> & args,
|
||||||
|
const std::string & help,
|
||||||
|
void (*handler)(gpt_params & params)
|
||||||
|
) : args(args), help(help), handler_void(handler) {}
|
||||||
|
|
||||||
|
// support 2 values for arg
|
||||||
|
llama_arg(
|
||||||
|
const std::initializer_list<const char *> & args,
|
||||||
|
const char * value_hint,
|
||||||
|
const char * value_hint_2,
|
||||||
|
const std::string & help,
|
||||||
|
void (*handler)(gpt_params & params, const std::string &, const std::string &)
|
||||||
|
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
|
||||||
|
|
||||||
|
llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
|
||||||
|
llama_arg & set_env(const char * env);
|
||||||
|
llama_arg & set_sparam();
|
||||||
|
bool in_example(enum llama_example ex);
|
||||||
|
bool get_value_from_env(std::string & output);
|
||||||
|
bool has_value_from_env();
|
||||||
|
std::string to_string();
|
||||||
|
};
|
||||||
|
|
||||||
|
struct gpt_params_context {
|
||||||
|
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
|
||||||
|
gpt_params & params;
|
||||||
|
std::vector<llama_arg> options;
|
||||||
|
void(*print_usage)(int, char **) = nullptr;
|
||||||
|
gpt_params_context(gpt_params & params) : params(params) {}
|
||||||
|
};
|
||||||
|
|
||||||
|
// parse input arguments from CLI
|
||||||
|
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
|
||||||
|
bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||||
|
|
||||||
|
// function to be used by test-arg-parser
|
||||||
|
gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
1929
common/common.cpp
1929
common/common.cpp
File diff suppressed because it is too large
Load Diff
214
common/common.h
214
common/common.h
@ -4,20 +4,11 @@
|
|||||||
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include "sampling.h"
|
|
||||||
|
|
||||||
#define LOG_NO_FILE_LINE_FUNCTION
|
#define LOG_NO_FILE_LINE_FUNCTION
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
|
|
||||||
#include <cmath>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <random>
|
|
||||||
#include <thread>
|
|
||||||
#include <set>
|
|
||||||
#include <unordered_map>
|
|
||||||
#include <tuple>
|
|
||||||
#include <functional>
|
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#define DIRECTORY_SEPARATOR '\\'
|
#define DIRECTORY_SEPARATOR '\\'
|
||||||
@ -56,11 +47,20 @@ struct llama_control_vector_load_info;
|
|||||||
// CPU utils
|
// CPU utils
|
||||||
//
|
//
|
||||||
|
|
||||||
|
struct cpu_params {
|
||||||
|
int n_threads = -1;
|
||||||
|
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
|
||||||
|
bool mask_valid = false; // Default: any CPU
|
||||||
|
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
|
||||||
|
bool strict_cpu = false; // Use strict CPU placement
|
||||||
|
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
||||||
|
};
|
||||||
|
|
||||||
int32_t cpu_get_num_physical_cores();
|
int32_t cpu_get_num_physical_cores();
|
||||||
int32_t cpu_get_num_math();
|
int32_t cpu_get_num_math();
|
||||||
|
|
||||||
//
|
//
|
||||||
// CLI argument parsing
|
// Common params
|
||||||
//
|
//
|
||||||
|
|
||||||
enum llama_example {
|
enum llama_example {
|
||||||
@ -78,28 +78,71 @@ enum llama_example {
|
|||||||
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
|
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
|
||||||
LLAMA_EXAMPLE_EXPORT_LORA,
|
LLAMA_EXAMPLE_EXPORT_LORA,
|
||||||
LLAMA_EXAMPLE_LLAVA,
|
LLAMA_EXAMPLE_LLAVA,
|
||||||
|
LLAMA_EXAMPLE_LOOKUP,
|
||||||
|
LLAMA_EXAMPLE_PARALLEL,
|
||||||
|
|
||||||
LLAMA_EXAMPLE_COUNT,
|
LLAMA_EXAMPLE_COUNT,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum gpt_sampler_type {
|
||||||
|
GPT_SAMPLER_TYPE_NONE = 0,
|
||||||
|
GPT_SAMPLER_TYPE_TOP_K = 1,
|
||||||
|
GPT_SAMPLER_TYPE_TOP_P = 2,
|
||||||
|
GPT_SAMPLER_TYPE_MIN_P = 3,
|
||||||
|
GPT_SAMPLER_TYPE_TFS_Z = 4,
|
||||||
|
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
|
||||||
|
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
|
||||||
|
};
|
||||||
|
|
||||||
// dimensionality reduction methods, used by cvector-generator
|
// dimensionality reduction methods, used by cvector-generator
|
||||||
enum dimre_method {
|
enum dimre_method {
|
||||||
DIMRE_METHOD_PCA,
|
DIMRE_METHOD_PCA,
|
||||||
DIMRE_METHOD_MEAN,
|
DIMRE_METHOD_MEAN,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct cpu_params {
|
// sampler parameters
|
||||||
int n_threads = -1;
|
struct gpt_sampler_params {
|
||||||
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
|
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||||
bool mask_valid = false; // Default: any CPU
|
|
||||||
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
|
int32_t n_prev = 64; // number of previous tokens to remember
|
||||||
bool strict_cpu = false; // Use strict CPU placement
|
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||||
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||||
|
int32_t top_k = 40; // <= 0 to use vocab size
|
||||||
|
float top_p = 0.95f; // 1.0 = disabled
|
||||||
|
float min_p = 0.05f; // 0.0 = disabled
|
||||||
|
float tfs_z = 1.00f; // 1.0 = disabled
|
||||||
|
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||||
|
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||||
|
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||||
|
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||||
|
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||||
|
float penalty_repeat = 1.00f; // 1.0 = disabled
|
||||||
|
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||||
|
float penalty_present = 0.00f; // 0.0 = disabled
|
||||||
|
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||||
|
float mirostat_tau = 5.00f; // target entropy
|
||||||
|
float mirostat_eta = 0.10f; // learning rate
|
||||||
|
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||||
|
bool ignore_eos = false;
|
||||||
|
|
||||||
|
std::vector<enum gpt_sampler_type> samplers = {
|
||||||
|
GPT_SAMPLER_TYPE_TOP_K,
|
||||||
|
GPT_SAMPLER_TYPE_TFS_Z,
|
||||||
|
GPT_SAMPLER_TYPE_TYPICAL_P,
|
||||||
|
GPT_SAMPLER_TYPE_TOP_P,
|
||||||
|
GPT_SAMPLER_TYPE_MIN_P,
|
||||||
|
GPT_SAMPLER_TYPE_TEMPERATURE
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||||
|
|
||||||
|
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||||
|
|
||||||
|
// print the parameters into a string
|
||||||
|
std::string print() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct gpt_params {
|
struct gpt_params {
|
||||||
enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON;
|
|
||||||
|
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
int32_t n_predict = -1; // new tokens to predict
|
||||||
int32_t n_ctx = 0; // context size
|
int32_t n_ctx = 0; // context size
|
||||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
@ -143,23 +186,23 @@ struct gpt_params {
|
|||||||
|
|
||||||
struct gpt_sampler_params sparams;
|
struct gpt_sampler_params sparams;
|
||||||
|
|
||||||
std::string model = ""; // model path
|
std::string model = ""; // model path // NOLINT
|
||||||
std::string model_draft = ""; // draft model for speculative decoding
|
std::string model_draft = ""; // draft model for speculative decoding // NOLINT
|
||||||
std::string model_alias = "unknown"; // model alias
|
std::string model_alias = "unknown"; // model alias // NOLINT
|
||||||
std::string model_url = ""; // model url to download
|
std::string model_url = ""; // model url to download // NOLINT
|
||||||
std::string hf_token = ""; // HF token
|
std::string hf_token = ""; // HF token // NOLINT
|
||||||
std::string hf_repo = ""; // HF repo
|
std::string hf_repo = ""; // HF repo // NOLINT
|
||||||
std::string hf_file = ""; // HF file
|
std::string hf_file = ""; // HF file // NOLINT
|
||||||
std::string prompt = "";
|
std::string prompt = ""; // NOLINT
|
||||||
std::string prompt_file = ""; // store the external prompt file name
|
std::string prompt_file = ""; // store the external prompt file name // NOLINT
|
||||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
|
||||||
std::string input_prefix = ""; // string to prefix user inputs with
|
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
|
||||||
std::string input_suffix = ""; // string to suffix user inputs with
|
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
|
||||||
std::string logdir = ""; // directory in which to save YAML log files
|
std::string logdir = ""; // directory in which to save YAML log files // NOLINT
|
||||||
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
|
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
|
||||||
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
|
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
|
||||||
std::string logits_file = ""; // file for saving *all* logits
|
std::string logits_file = ""; // file for saving *all* logits // NOLINT
|
||||||
std::string rpc_servers = ""; // comma separated list of RPC servers
|
std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
|
||||||
|
|
||||||
std::vector<std::string> in_files; // all input files
|
std::vector<std::string> in_files; // all input files
|
||||||
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
||||||
@ -189,7 +232,6 @@ struct gpt_params {
|
|||||||
|
|
||||||
bool kl_divergence = false; // compute KL divergence
|
bool kl_divergence = false; // compute KL divergence
|
||||||
|
|
||||||
std::function<void(int, char **)> print_usage = nullptr; // print example-specific usage and example
|
|
||||||
bool usage = false; // print usage
|
bool usage = false; // print usage
|
||||||
bool use_color = false; // use color to distinguish generations and inputs
|
bool use_color = false; // use color to distinguish generations and inputs
|
||||||
bool special = false; // enable special token output
|
bool special = false; // enable special token output
|
||||||
@ -220,7 +262,7 @@ struct gpt_params {
|
|||||||
std::string cache_type_v = "f16"; // KV cache data type for the V
|
std::string cache_type_v = "f16"; // KV cache data type for the V
|
||||||
|
|
||||||
// multimodal models (see examples/llava)
|
// multimodal models (see examples/llava)
|
||||||
std::string mmproj = ""; // path to multimodal projector
|
std::string mmproj = ""; // path to multimodal projector // NOLINT
|
||||||
std::vector<std::string> image; // path to image file(s)
|
std::vector<std::string> image; // path to image file(s)
|
||||||
|
|
||||||
// embedding
|
// embedding
|
||||||
@ -236,15 +278,15 @@ struct gpt_params {
|
|||||||
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||||
|
|
||||||
std::string hostname = "127.0.0.1";
|
std::string hostname = "127.0.0.1";
|
||||||
std::string public_path = "";
|
std::string public_path = ""; // NOLINT
|
||||||
std::string chat_template = "";
|
std::string chat_template = ""; // NOLINT
|
||||||
std::string system_prompt = "";
|
std::string system_prompt = ""; // NOLINT
|
||||||
bool enable_chat_template = true;
|
bool enable_chat_template = true;
|
||||||
|
|
||||||
std::vector<std::string> api_keys;
|
std::vector<std::string> api_keys;
|
||||||
|
|
||||||
std::string ssl_file_key = "";
|
std::string ssl_file_key = ""; // NOLINT
|
||||||
std::string ssl_file_cert = "";
|
std::string ssl_file_cert = ""; // NOLINT
|
||||||
|
|
||||||
bool endpoint_slots = true;
|
bool endpoint_slots = true;
|
||||||
bool endpoint_metrics = false;
|
bool endpoint_metrics = false;
|
||||||
@ -299,92 +341,6 @@ struct gpt_params {
|
|||||||
bool batched_bench_output_jsonl = false;
|
bool batched_bench_output_jsonl = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_arg {
|
|
||||||
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
|
|
||||||
std::vector<const char *> args;
|
|
||||||
const char * value_hint = nullptr; // help text or example for arg value
|
|
||||||
const char * value_hint_2 = nullptr; // for second arg value
|
|
||||||
const char * env = nullptr;
|
|
||||||
std::string help;
|
|
||||||
void (*handler_void) (gpt_params & params) = nullptr;
|
|
||||||
void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
|
|
||||||
void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
|
|
||||||
void (*handler_int) (gpt_params & params, int) = nullptr;
|
|
||||||
|
|
||||||
llama_arg(
|
|
||||||
const std::initializer_list<const char *> & args,
|
|
||||||
const char * value_hint,
|
|
||||||
const std::string & help,
|
|
||||||
void (*handler)(gpt_params & params, const std::string &)
|
|
||||||
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
|
|
||||||
|
|
||||||
llama_arg(
|
|
||||||
const std::initializer_list<const char *> & args,
|
|
||||||
const char * value_hint,
|
|
||||||
const std::string & help,
|
|
||||||
void (*handler)(gpt_params & params, int)
|
|
||||||
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
|
|
||||||
|
|
||||||
llama_arg(
|
|
||||||
const std::initializer_list<const char *> & args,
|
|
||||||
const std::string & help,
|
|
||||||
void (*handler)(gpt_params & params)
|
|
||||||
) : args(args), help(help), handler_void(handler) {}
|
|
||||||
|
|
||||||
// support 2 values for arg
|
|
||||||
llama_arg(
|
|
||||||
const std::initializer_list<const char *> & args,
|
|
||||||
const char * value_hint,
|
|
||||||
const char * value_hint_2,
|
|
||||||
const std::string & help,
|
|
||||||
void (*handler)(gpt_params & params, const std::string &, const std::string &)
|
|
||||||
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
|
|
||||||
|
|
||||||
llama_arg & set_examples(std::initializer_list<enum llama_example> examples) {
|
|
||||||
this->examples = std::move(examples);
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_arg & set_env(const char * env) {
|
|
||||||
help = help + "\n(env: " + env + ")";
|
|
||||||
this->env = env;
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool in_example(enum llama_example ex) {
|
|
||||||
return examples.find(ex) != examples.end();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool get_value_from_env(std::string & output) const {
|
|
||||||
if (env == nullptr) return false;
|
|
||||||
char * value = std::getenv(env);
|
|
||||||
if (value) {
|
|
||||||
output = value;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool has_value_from_env() const {
|
|
||||||
return env != nullptr && std::getenv(env);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string to_string();
|
|
||||||
};
|
|
||||||
|
|
||||||
// initialize list of options (arguments) that can be used by the current example
|
|
||||||
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex);
|
|
||||||
// optionally, we can provide "print_usage" to print example usage
|
|
||||||
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage);
|
|
||||||
|
|
||||||
// parse input arguments from CLI
|
|
||||||
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
|
|
||||||
bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
|
|
||||||
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
|
|
||||||
|
|
||||||
// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set
|
|
||||||
void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options);
|
|
||||||
|
|
||||||
std::string gpt_params_get_system_info(const gpt_params & params);
|
std::string gpt_params_get_system_info(const gpt_params & params);
|
||||||
|
|
||||||
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||||
|
@ -2,6 +2,9 @@
|
|||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||||
// TODO: deduplicate with llama-impl.h
|
// TODO: deduplicate with llama-impl.h
|
||||||
template<typename T>
|
template<typename T>
|
||||||
@ -420,7 +423,7 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
|
std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
|
||||||
std::unordered_map<char, gpt_sampler_type> sampler_name_map {
|
std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
|
||||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
|
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
|
||||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
|
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
|
||||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
|
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||||
|
@ -2,61 +2,11 @@
|
|||||||
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
enum gpt_sampler_type {
|
|
||||||
GPT_SAMPLER_TYPE_NONE = 0,
|
|
||||||
GPT_SAMPLER_TYPE_TOP_K = 1,
|
|
||||||
GPT_SAMPLER_TYPE_TOP_P = 2,
|
|
||||||
GPT_SAMPLER_TYPE_MIN_P = 3,
|
|
||||||
GPT_SAMPLER_TYPE_TFS_Z = 4,
|
|
||||||
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
|
|
||||||
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
|
|
||||||
};
|
|
||||||
|
|
||||||
// sampling parameters
|
|
||||||
struct gpt_sampler_params {
|
|
||||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
|
||||||
|
|
||||||
int32_t n_prev = 64; // number of previous tokens to remember
|
|
||||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
|
||||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
|
||||||
int32_t top_k = 40; // <= 0 to use vocab size
|
|
||||||
float top_p = 0.95f; // 1.0 = disabled
|
|
||||||
float min_p = 0.05f; // 0.0 = disabled
|
|
||||||
float tfs_z = 1.00f; // 1.0 = disabled
|
|
||||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
|
||||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
|
||||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
|
||||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
|
||||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
|
||||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
|
||||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
|
||||||
float penalty_present = 0.00f; // 0.0 = disabled
|
|
||||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
|
||||||
float mirostat_tau = 5.00f; // target entropy
|
|
||||||
float mirostat_eta = 0.10f; // learning rate
|
|
||||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
|
||||||
bool ignore_eos = false;
|
|
||||||
|
|
||||||
std::vector<enum gpt_sampler_type> samplers = {
|
|
||||||
GPT_SAMPLER_TYPE_TOP_K,
|
|
||||||
GPT_SAMPLER_TYPE_TFS_Z,
|
|
||||||
GPT_SAMPLER_TYPE_TYPICAL_P,
|
|
||||||
GPT_SAMPLER_TYPE_TOP_P,
|
|
||||||
GPT_SAMPLER_TYPE_MIN_P,
|
|
||||||
GPT_SAMPLER_TYPE_TEMPERATURE
|
|
||||||
};
|
|
||||||
|
|
||||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
|
||||||
|
|
||||||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
|
||||||
|
|
||||||
// print the parameters into a string
|
|
||||||
std::string print() const;
|
|
||||||
};
|
|
||||||
|
|
||||||
// gpt_sampler extends llama_sampler with additional functionality:
|
// gpt_sampler extends llama_sampler with additional functionality:
|
||||||
//
|
//
|
||||||
// - grammar support
|
// - grammar support
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
@ -37,8 +38,7 @@ static void print_usage(int, char ** argv) {
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
@ -18,8 +19,7 @@ int main(int argc, char ** argv) {
|
|||||||
params.prompt = "Hello my name is";
|
params.prompt = "Hello my name is";
|
||||||
params.n_predict = 32;
|
params.n_predict = 32;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
@ -388,8 +389,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -12,12 +12,9 @@
|
|||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
|
#include <random>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <tuple>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <algorithm>
|
|
||||||
#include <iostream>
|
|
||||||
#include <fstream>
|
|
||||||
|
|
||||||
#define DEBUG_POS 5
|
#define DEBUG_POS 5
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
@ -79,8 +80,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
@ -144,8 +145,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
@ -401,8 +402,7 @@ static void print_usage(int, char ** argv) {
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
@ -9,11 +10,11 @@ static void export_md(std::string fname, llama_example ex) {
|
|||||||
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
|
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
|
||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
auto options = gpt_params_parser_init(params, ex);
|
auto ctx_arg = gpt_params_parser_init(params, ex);
|
||||||
|
|
||||||
file << "| Argument | Explanation |\n";
|
file << "| Argument | Explanation |\n";
|
||||||
file << "| -------- | ----------- |\n";
|
file << "| -------- | ----------- |\n";
|
||||||
for (auto & opt : options) {
|
for (auto & opt : ctx_arg.options) {
|
||||||
file << "| `";
|
file << "| `";
|
||||||
// args
|
// args
|
||||||
for (const auto & arg : opt.args) {
|
for (const auto & arg : opt.args) {
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
@ -153,8 +154,7 @@ static std::string gritlm_instruction(const std::string & instruction) {
|
|||||||
int main(int argc, char * argv[]) {
|
int main(int argc, char * argv[]) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
@ -577,8 +578,7 @@ int main(int argc, char ** argv) {
|
|||||||
params.logits_all = true;
|
params.logits_all = true;
|
||||||
params.verbosity = 1;
|
params.verbosity = 1;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_IMATRIX, print_usage);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#include "console.h"
|
#include "console.h"
|
||||||
|
#include "sampling.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
@ -105,8 +106,7 @@ int main(int argc, char ** argv) {
|
|||||||
gpt_params params;
|
gpt_params params;
|
||||||
g_params = &params;
|
g_params = &params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_INFILL);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,11 +1,12 @@
|
|||||||
#include "ggml.h"
|
#include "arg.h"
|
||||||
|
#include "base64.hpp"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "sampling.h"
|
||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
#include "llava.h"
|
#include "llava.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
#include "base64.hpp"
|
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
@ -278,8 +279,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,9 +1,11 @@
|
|||||||
#include "ggml.h"
|
#include "arg.h"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "sampling.h"
|
||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
#include "llava.h"
|
#include "llava.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
@ -253,8 +255,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "sampling.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
@ -36,8 +38,7 @@ struct ngram_container {
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
#include "ggml.h"
|
#include "arg.h"
|
||||||
#include "llama.h"
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "ngram-cache.h"
|
#include "ngram-cache.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
@ -13,8 +14,7 @@
|
|||||||
int main(int argc, char ** argv){
|
int main(int argc, char ** argv){
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -40,4 +40,6 @@ int main(int argc, char ** argv){
|
|||||||
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
|
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
|
||||||
|
|
||||||
llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
|
llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
|
||||||
|
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
#include "ggml.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
#include "ngram-cache.h"
|
#include "ngram-cache.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
@ -15,8 +16,7 @@
|
|||||||
int main(int argc, char ** argv){
|
int main(int argc, char ** argv){
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "llama.h"
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "ngram-cache.h"
|
#include "ngram-cache.h"
|
||||||
|
#include "sampling.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
@ -12,8 +14,7 @@
|
|||||||
int main(int argc, char ** argv){
|
int main(int argc, char ** argv){
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#include "console.h"
|
#include "console.h"
|
||||||
|
#include "sampling.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
@ -138,9 +139,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
g_params = &params;
|
g_params = &params;
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
// A basic application simulating a server with multiple clients.
|
// A basic application simulating a server with multiple clients.
|
||||||
// The clients submit requests to the server and they are processed in parallel.
|
// The clients submit requests to the server and they are processed in parallel.
|
||||||
|
|
||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "sampling.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
@ -100,8 +102,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
@ -19,8 +20,7 @@ int main(int argc, char ** argv) {
|
|||||||
params.n_keep = 32;
|
params.n_keep = 32;
|
||||||
params.i_pos = -1;
|
params.i_pos = -1;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,18 +1,19 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <array>
|
||||||
|
#include <atomic>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
|
#include <fstream>
|
||||||
|
#include <mutex>
|
||||||
|
#include <random>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
#include <mutex>
|
|
||||||
#include <atomic>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <array>
|
|
||||||
#include <fstream>
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
@ -1967,8 +1968,7 @@ int main(int argc, char ** argv) {
|
|||||||
params.n_ctx = 512;
|
params.n_ctx = 512;
|
||||||
params.logits_all = true;
|
params.logits_all = true;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
@ -111,8 +112,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
@ -10,8 +11,7 @@ int main(int argc, char ** argv) {
|
|||||||
params.prompt = "The quick brown fox";
|
params.prompt = "The quick brown fox";
|
||||||
params.sparams.seed = 1234;
|
params.sparams.seed = 1234;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -23,36 +23,32 @@ The project is under active development, and we are [looking for feedback and co
|
|||||||
| `--version` | show version and build info |
|
| `--version` | show version and build info |
|
||||||
| `-v, --verbose` | print verbose information |
|
| `-v, --verbose` | print verbose information |
|
||||||
| `--verbosity N` | set specific verbosity level (default: 0) |
|
| `--verbosity N` | set specific verbosity level (default: 0) |
|
||||||
| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
|
|
||||||
| `--no-display-prompt` | don't print prompt at generation (default: false) |
|
|
||||||
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
|
|
||||||
| `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
|
| `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
|
||||||
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
|
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
|
||||||
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
|
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
|
||||||
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
|
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
|
||||||
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
|
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
|
||||||
|
| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
|
||||||
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
|
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
|
||||||
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
|
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
|
||||||
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
|
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
|
||||||
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
|
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
|
||||||
|
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
|
||||||
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
|
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
|
||||||
| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) |
|
|
||||||
| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) |
|
|
||||||
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
||||||
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
|
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
|
||||||
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
||||||
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||||
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
||||||
| `--chunks N` | max number of chunks to process (default: -1, -1 = all) |
|
|
||||||
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
||||||
| `-p, --prompt PROMPT` | prompt to start generation with |
|
| `-p, --prompt PROMPT` | prompt to start generation with |
|
||||||
| `-f, --file FNAME` | a file containing the prompt (default: none) |
|
| `-f, --file FNAME` | a file containing the prompt (default: none) |
|
||||||
| `--in-file FNAME` | an input file (repeat to specify multiple files) |
|
|
||||||
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
|
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
|
||||||
| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
|
| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
|
||||||
| `--no-escape` | do not process escape sequences |
|
| `--no-escape` | do not process escape sequences |
|
||||||
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
||||||
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typical_p;top_p;min_p;temperature) |
|
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
|
||||||
|
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
|
||||||
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
|
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
|
||||||
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||||
| `--penalize-nl` | penalize newline tokens (default: false) |
|
| `--penalize-nl` | penalize newline tokens (default: false) |
|
||||||
@ -92,13 +88,12 @@ The project is under active development, and we are [looking for feedback and co
|
|||||||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
|
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
|
||||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||||
| `-np, --parallel N` | number of parallel sequences to decode (default: 1) |
|
| `-np, --parallel N` | number of parallel sequences to decode (default: 1) |
|
||||||
| `-ns, --sequences N` | number of sequences to decode (default: 1) |
|
|
||||||
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
||||||
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
|
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
|
||||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
|
| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
|
||||||
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
|
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
|
||||||
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
|
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
|
||||||
| `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
|
| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
|
||||||
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
|
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
|
||||||
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
|
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
|
||||||
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
|
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
|
||||||
@ -109,7 +104,7 @@ The project is under active development, and we are [looking for feedback and co
|
|||||||
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
|
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
|
||||||
| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
|
| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
|
||||||
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
|
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
|
||||||
| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_MODEL) |
|
| `-a, --alias STRING` | set alias for model name (to be used by REST API) |
|
||||||
| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
|
| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
|
||||||
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
|
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
|
||||||
| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
|
| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
|
||||||
@ -123,7 +118,7 @@ The project is under active development, and we are [looking for feedback and co
|
|||||||
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
|
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
|
||||||
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key |
|
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key |
|
||||||
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate |
|
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate |
|
||||||
| `--timeout N` | server read/write timeout in seconds (default: 600) |
|
| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
|
||||||
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
|
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
|
||||||
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
|
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
|
||||||
| `--log-format {text, json}` | log output format: json or text (default: json) |
|
| `--log-format {text, json}` | log output format: json or text (default: json) |
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
#include "utils.hpp"
|
#include "utils.hpp"
|
||||||
|
|
||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "sampling.h"
|
||||||
#include "json-schema-to-grammar.h"
|
#include "json-schema-to-grammar.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
@ -2423,8 +2425,7 @@ int main(int argc, char ** argv) {
|
|||||||
// own arguments required by this example
|
// own arguments required by this example
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
@ -18,8 +19,7 @@ int main(int argc, char ** argv) {
|
|||||||
params.prompt = "Hello my name is";
|
params.prompt = "Hello my name is";
|
||||||
params.n_predict = 32;
|
params.n_predict = 32;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,11 +1,13 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "sampling.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cmath>
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <set>
|
#include <set>
|
||||||
|
#include <random>
|
||||||
|
|
||||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
|
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
|
||||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||||
@ -27,8 +29,7 @@ struct seq_draft {
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,3 +1,6 @@
|
|||||||
|
#include "arg.h"
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
@ -6,18 +9,16 @@
|
|||||||
#undef NDEBUG
|
#undef NDEBUG
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
|
||||||
#include "common.h"
|
|
||||||
|
|
||||||
int main(void) {
|
int main(void) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
|
printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
|
||||||
for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
|
for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
|
||||||
try {
|
try {
|
||||||
auto options = gpt_params_parser_init(params, (enum llama_example)ex);
|
auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex);
|
||||||
std::unordered_set<std::string> seen_args;
|
std::unordered_set<std::string> seen_args;
|
||||||
std::unordered_set<std::string> seen_env_vars;
|
std::unordered_set<std::string> seen_env_vars;
|
||||||
for (const auto & opt : options) {
|
for (const auto & opt : ctx_arg.options) {
|
||||||
// check for args duplications
|
// check for args duplications
|
||||||
for (const auto & arg : opt.args) {
|
for (const auto & arg : opt.args) {
|
||||||
if (seen_args.find(arg) == seen_args.end()) {
|
if (seen_args.find(arg) == seen_args.end()) {
|
||||||
@ -52,40 +53,51 @@ int main(void) {
|
|||||||
};
|
};
|
||||||
|
|
||||||
std::vector<std::string> argv;
|
std::vector<std::string> argv;
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
|
||||||
|
|
||||||
printf("test-arg-parser: test invalid usage\n\n");
|
printf("test-arg-parser: test invalid usage\n\n");
|
||||||
|
|
||||||
|
// missing value
|
||||||
argv = {"binary_name", "-m"};
|
argv = {"binary_name", "-m"};
|
||||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
|
|
||||||
|
// wrong value (int)
|
||||||
argv = {"binary_name", "-ngl", "hello"};
|
argv = {"binary_name", "-ngl", "hello"};
|
||||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
|
|
||||||
|
// wrong value (enum)
|
||||||
argv = {"binary_name", "-sm", "hello"};
|
argv = {"binary_name", "-sm", "hello"};
|
||||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
|
|
||||||
|
// non-existent arg in specific example (--draft cannot be used outside llama-speculative)
|
||||||
|
argv = {"binary_name", "--draft", "123"};
|
||||||
|
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
|
||||||
|
|
||||||
|
|
||||||
printf("test-arg-parser: test valid usage\n\n");
|
printf("test-arg-parser: test valid usage\n\n");
|
||||||
|
|
||||||
argv = {"binary_name", "-m", "model_file.gguf"};
|
argv = {"binary_name", "-m", "model_file.gguf"};
|
||||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
assert(params.model == "model_file.gguf");
|
assert(params.model == "model_file.gguf");
|
||||||
|
|
||||||
argv = {"binary_name", "-t", "1234"};
|
argv = {"binary_name", "-t", "1234"};
|
||||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
assert(params.cpuparams.n_threads == 1234);
|
assert(params.cpuparams.n_threads == 1234);
|
||||||
|
|
||||||
argv = {"binary_name", "--verbose"};
|
argv = {"binary_name", "--verbose"};
|
||||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
assert(params.verbosity == 1);
|
assert(params.verbosity == 1);
|
||||||
|
|
||||||
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
|
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
|
||||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
assert(params.model == "abc.gguf");
|
assert(params.model == "abc.gguf");
|
||||||
assert(params.n_predict == 6789);
|
assert(params.n_predict == 6789);
|
||||||
assert(params.n_batch == 9090);
|
assert(params.n_batch == 9090);
|
||||||
|
|
||||||
|
// --draft is only available in llama-speculative, so it must be accepted there
|
||||||
|
argv = {"binary_name", "--draft", "123"};
|
||||||
|
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
|
||||||
|
assert(params.n_draft == 123);
|
||||||
|
|
||||||
// skip this part on windows, because setenv is not supported
|
// skip this part on windows, because setenv is not supported
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
printf("test-arg-parser: skip on windows build\n");
|
printf("test-arg-parser: skip on windows build\n");
|
||||||
@ -94,12 +106,12 @@ int main(void) {
|
|||||||
|
|
||||||
setenv("LLAMA_ARG_THREADS", "blah", true);
|
setenv("LLAMA_ARG_THREADS", "blah", true);
|
||||||
argv = {"binary_name"};
|
argv = {"binary_name"};
|
||||||
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
|
|
||||||
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
|
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
|
||||||
setenv("LLAMA_ARG_THREADS", "1010", true);
|
setenv("LLAMA_ARG_THREADS", "1010", true);
|
||||||
argv = {"binary_name"};
|
argv = {"binary_name"};
|
||||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
assert(params.model == "blah.gguf");
|
assert(params.model == "blah.gguf");
|
||||||
assert(params.cpuparams.n_threads == 1010);
|
assert(params.cpuparams.n_threads == 1010);
|
||||||
|
|
||||||
@ -109,7 +121,7 @@ int main(void) {
|
|||||||
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
|
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
|
||||||
setenv("LLAMA_ARG_THREADS", "1010", true);
|
setenv("LLAMA_ARG_THREADS", "1010", true);
|
||||||
argv = {"binary_name", "-m", "overwritten.gguf"};
|
argv = {"binary_name", "-m", "overwritten.gguf"};
|
||||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
|
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
assert(params.model == "overwritten.gguf");
|
assert(params.model == "overwritten.gguf");
|
||||||
assert(params.cpuparams.n_threads == 1010);
|
assert(params.cpuparams.n_threads == 1010);
|
||||||
#endif // _WIN32
|
#endif // _WIN32
|
||||||
|
Loading…
Reference in New Issue
Block a user