2024-09-09 23:36:09 +02:00
# include "arg.h"
2024-09-15 19:46:12 +02:00
# include "log.h"
2024-09-09 23:36:09 +02:00
# include "sampling.h"
# include <algorithm>
2024-09-15 19:46:12 +02:00
# include <climits>
# include <cstdarg>
2024-09-09 23:36:09 +02:00
# include <fstream>
# include <regex>
2024-09-15 19:46:12 +02:00
# include <set>
# include <string>
# include <thread>
# include <vector>
2024-09-09 23:36:09 +02:00
# include "json-schema-to-grammar.h"
using json = nlohmann : : ordered_json ;
2024-10-10 22:57:42 +02:00
// Records the set of examples this argument belongs to (queried by in_example()).
// Returns *this so that setters can be chained on a freshly constructed argument.
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
    // std::initializer_list is a lightweight view over const elements: "moving" it
    // is an ordinary copy, so assign it directly instead of suggesting a transfer.
    this->examples = examples;
    return *this;
}
2025-01-08 11:55:36 +01:00
// Records the set of examples for which this argument must be left out
// (queried by is_exclude()). Returns *this so that setters can be chained.
common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
    // std::initializer_list is a lightweight view over const elements: "moving" it
    // is an ordinary copy, so assign it directly instead of suggesting a transfer.
    this->excludes = excludes;
    return *this;
}
2024-10-10 22:57:42 +02:00
// Associates an environment variable name with this argument and appends a
// "(env: NAME)" note to the help text. Returns *this for chaining.
common_arg & common_arg::set_env(const char * env) {
    // advertise the variable on its own line of the help text
    help += "\n(env: ";
    help += env;
    help += ")";

    this->env = env;
    return *this;
}
2024-10-10 22:57:42 +02:00
// Flags this argument as a sampling parameter (is_sparam), which the usage
// printer lists in its own section. Returns *this for chaining.
common_arg & common_arg::set_sparam() {
    this->is_sparam = true;
    return *this;
}
2024-10-10 22:57:42 +02:00
bool common_arg : : in_example ( enum llama_example ex ) {
2024-09-09 23:36:09 +02:00
return examples . find ( ex ) ! = examples . end ( ) ;
}
2025-01-08 11:55:36 +01:00
bool common_arg : : is_exclude ( enum llama_example ex ) {
return excludes . find ( ex ) ! = excludes . end ( ) ;
}
2024-10-10 22:57:42 +02:00
bool common_arg : : get_value_from_env ( std : : string & output ) {
2024-09-09 23:36:09 +02:00
if ( env = = nullptr ) return false ;
char * value = std : : getenv ( env ) ;
if ( value ) {
output = value ;
return true ;
}
return false ;
}
2024-10-10 22:57:42 +02:00
bool common_arg : : has_value_from_env ( ) {
2024-09-09 23:36:09 +02:00
return env ! = nullptr & & std : : getenv ( env ) ;
}
// Splits `input` on '\n' and word-wraps every resulting line that is longer
// than `max_char_per_line`.
//  - lines that already fit are kept verbatim (including empty ones)
//  - longer lines are re-flowed word by word, joined with single spaces
//  - a single word longer than the limit is emitted on a line of its own
static std::vector<std::string> break_str_into_lines(std::string input, size_t max_char_per_line) {
    std::vector<std::string> lines;
    std::istringstream source(input);

    for (std::string raw; std::getline(source, raw); ) {
        if (raw.length() <= max_char_per_line) {
            lines.push_back(raw);
            continue;
        }

        // too long: rebuild the line greedily from its whitespace-separated words
        std::istringstream words(raw);
        std::string pending;
        for (std::string word; words >> word; ) {
            const size_t separator = pending.empty() ? 0 : 1;
            if (pending.length() + separator + word.length() <= max_char_per_line) {
                if (separator != 0) {
                    pending += " ";
                }
                pending += word;
                continue;
            }
            // the word does not fit: flush what we have and start a new line with it
            if (!pending.empty()) {
                lines.push_back(pending);
            }
            pending = word;
        }
        if (!pending.empty()) {
            lines.push_back(pending);
        }
    }

    return lines;
}
2024-10-10 22:57:42 +02:00
std : : string common_arg : : to_string ( ) {
2024-09-09 23:36:09 +02:00
// params for printing to console
const static int n_leading_spaces = 40 ;
const static int n_char_per_line_help = 70 ; // TODO: detect this based on current console
std : : string leading_spaces ( n_leading_spaces , ' ' ) ;
std : : ostringstream ss ;
for ( const auto arg : args ) {
if ( arg = = args . front ( ) ) {
if ( args . size ( ) = = 1 ) {
ss < < arg ;
} else {
// first arg is usually abbreviation, we need padding to make it more beautiful
auto tmp = std : : string ( arg ) + " , " ;
auto spaces = std : : string ( std : : max ( 0 , 7 - ( int ) tmp . size ( ) ) , ' ' ) ;
ss < < tmp < < spaces ;
}
} else {
ss < < arg < < ( arg ! = args . back ( ) ? " , " : " " ) ;
}
}
if ( value_hint ) ss < < " " < < value_hint ;
if ( value_hint_2 ) ss < < " " < < value_hint_2 ;
if ( ss . tellp ( ) > n_leading_spaces - 3 ) {
// current line is too long, add new line
ss < < " \n " < < leading_spaces ;
} else {
// padding between arg and help, same line
ss < < std : : string ( leading_spaces . size ( ) - ss . tellp ( ) , ' ' ) ;
}
const auto help_lines = break_str_into_lines ( help , n_char_per_line_help ) ;
for ( const auto & line : help_lines ) {
ss < < ( & line = = & help_lines . front ( ) ? " " : leading_spaces ) < < line < < " \n " ;
}
return ss . str ( ) ;
}
//
// utils
//
2024-12-18 18:27:21 +01:00
static void common_params_handle_model_default (
std : : string & model ,
2025-01-13 13:56:23 +01:00
const std : : string & model_url ,
2024-12-18 18:27:21 +01:00
std : : string & hf_repo ,
2025-01-13 13:56:23 +01:00
std : : string & hf_file ,
const std : : string & hf_token ) {
2024-12-18 18:27:21 +01:00
if ( ! hf_repo . empty ( ) ) {
2024-09-09 23:36:09 +02:00
// short-hand to avoid specifying --hf-file -> default it to --model
2024-12-18 18:27:21 +01:00
if ( hf_file . empty ( ) ) {
if ( model . empty ( ) ) {
2025-01-13 13:56:23 +01:00
auto auto_detected = common_get_hf_file ( hf_repo , hf_token ) ;
if ( auto_detected . first . empty ( ) | | auto_detected . second . empty ( ) ) {
exit ( 1 ) ; // built without CURL, error message already printed
}
hf_repo = auto_detected . first ;
hf_file = auto_detected . second ;
} else {
hf_file = model ;
2024-09-09 23:36:09 +02:00
}
2025-01-13 13:56:23 +01:00
}
// make sure model path is present (for caching purposes)
if ( model . empty ( ) ) {
2024-11-27 22:30:52 +01:00
// this is to avoid different repo having same file name, or same file name in different subdirs
2024-12-18 18:27:21 +01:00
std : : string filename = hf_repo + " _ " + hf_file ;
2024-11-27 22:30:52 +01:00
// to make sure we don't have any slashes in the filename
string_replace_all ( filename , " / " , " _ " ) ;
2024-12-18 18:27:21 +01:00
model = fs_get_cache_file ( filename ) ;
2024-09-09 23:36:09 +02:00
}
2024-12-18 18:27:21 +01:00
} else if ( ! model_url . empty ( ) ) {
if ( model . empty ( ) ) {
auto f = string_split < std : : string > ( model_url , ' # ' ) . front ( ) ;
2024-10-25 17:57:54 +02:00
f = string_split < std : : string > ( f , ' ? ' ) . front ( ) ;
2024-12-18 18:27:21 +01:00
model = fs_get_cache_file ( string_split < std : : string > ( f , ' / ' ) . back ( ) ) ;
2024-09-09 23:36:09 +02:00
}
2024-12-18 18:27:21 +01:00
} else if ( model . empty ( ) ) {
model = DEFAULT_MODEL_PATH ;
2024-09-09 23:36:09 +02:00
}
}
2024-12-12 22:53:05 +01:00
const std : : vector < ggml_type > kv_cache_types = {
GGML_TYPE_F32 ,
GGML_TYPE_F16 ,
GGML_TYPE_BF16 ,
GGML_TYPE_Q8_0 ,
GGML_TYPE_Q4_0 ,
GGML_TYPE_Q4_1 ,
GGML_TYPE_IQ4_NL ,
GGML_TYPE_Q5_0 ,
GGML_TYPE_Q5_1 ,
} ;
// Maps a type name (as reported by ggml_type_name) to the matching entry of
// kv_cache_types. Throws std::runtime_error when no entry has that name.
static ggml_type kv_cache_type_from_str(const std::string & s) {
    const auto match = std::find_if(kv_cache_types.begin(), kv_cache_types.end(),
        [&s](ggml_type candidate) {
            return ggml_type_name(candidate) == s;
        });
    if (match == kv_cache_types.end()) {
        throw std::runtime_error("Unsupported cache type: " + s);
    }
    return *match;
}
static std : : string get_all_kv_cache_types ( ) {
std : : ostringstream msg ;
for ( const auto & type : kv_cache_types ) {
msg < < ggml_type_name ( type ) < < ( & type = = & kv_cache_types . back ( ) ? " " : " , " ) ;
}
return msg . str ( ) ;
}
2024-09-09 23:36:09 +02:00
//
// CLI argument parsing functions
//
2024-10-10 22:57:42 +02:00
static bool common_params_parse_ex ( int argc , char * * argv , common_params_context & ctx_arg ) {
2024-09-09 23:36:09 +02:00
std : : string arg ;
const std : : string arg_prefix = " -- " ;
2024-10-10 22:57:42 +02:00
common_params & params = ctx_arg . params ;
2024-09-09 23:36:09 +02:00
2024-10-10 22:57:42 +02:00
std : : unordered_map < std : : string , common_arg * > arg_to_options ;
2024-09-09 23:36:09 +02:00
for ( auto & opt : ctx_arg . options ) {
for ( const auto & arg : opt . args ) {
arg_to_options [ arg ] = & opt ;
}
}
// handle environment variables
for ( auto & opt : ctx_arg . options ) {
std : : string value ;
if ( opt . get_value_from_env ( value ) ) {
try {
if ( opt . handler_void & & ( value = = " 1 " | | value = = " true " ) ) {
opt . handler_void ( params ) ;
}
if ( opt . handler_int ) {
opt . handler_int ( params , std : : stoi ( value ) ) ;
}
if ( opt . handler_string ) {
opt . handler_string ( params , value ) ;
continue ;
}
} catch ( std : : exception & e ) {
2024-10-12 07:21:51 +02:00
throw std : : invalid_argument ( string_format (
2024-09-09 23:36:09 +02:00
" error while handling environment variable \" %s \" : %s \n \n " , opt . env , e . what ( ) ) ) ;
}
}
}
// handle command line arguments
auto check_arg = [ & ] ( int i ) {
if ( i + 1 > = argc ) {
throw std : : invalid_argument ( " expected value for argument " ) ;
}
} ;
for ( int i = 1 ; i < argc ; i + + ) {
const std : : string arg_prefix = " -- " ;
std : : string arg = argv [ i ] ;
if ( arg . compare ( 0 , arg_prefix . size ( ) , arg_prefix ) = = 0 ) {
std : : replace ( arg . begin ( ) , arg . end ( ) , ' _ ' , ' - ' ) ;
}
if ( arg_to_options . find ( arg ) = = arg_to_options . end ( ) ) {
2024-10-12 07:21:51 +02:00
throw std : : invalid_argument ( string_format ( " error: invalid argument: %s " , arg . c_str ( ) ) ) ;
2024-09-09 23:36:09 +02:00
}
auto opt = * arg_to_options [ arg ] ;
if ( opt . has_value_from_env ( ) ) {
fprintf ( stderr , " warn: %s environment variable is set, but will be overwritten by command line argument %s \n " , opt . env , arg . c_str ( ) ) ;
}
try {
if ( opt . handler_void ) {
opt . handler_void ( params ) ;
continue ;
}
// arg with single value
check_arg ( i ) ;
std : : string val = argv [ + + i ] ;
if ( opt . handler_int ) {
opt . handler_int ( params , std : : stoi ( val ) ) ;
continue ;
}
if ( opt . handler_string ) {
opt . handler_string ( params , val ) ;
continue ;
}
// arg with 2 values
check_arg ( i ) ;
std : : string val2 = argv [ + + i ] ;
if ( opt . handler_str_str ) {
opt . handler_str_str ( params , val , val2 ) ;
continue ;
}
} catch ( std : : exception & e ) {
2024-10-12 07:21:51 +02:00
throw std : : invalid_argument ( string_format (
2024-09-09 23:36:09 +02:00
" error while handling argument \" %s \" : %s \n \n "
" usage: \n %s \n \n to show complete usage, run with -h " ,
arg . c_str ( ) , e . what ( ) , arg_to_options [ arg ] - > to_string ( ) . c_str ( ) ) ) ;
}
}
2024-11-25 08:58:41 +01:00
postprocess_cpu_params ( params . cpuparams , nullptr ) ;
2024-09-09 23:36:09 +02:00
postprocess_cpu_params ( params . cpuparams_batch , & params . cpuparams ) ;
2024-11-25 08:58:41 +01:00
postprocess_cpu_params ( params . speculative . cpuparams , & params . cpuparams ) ;
postprocess_cpu_params ( params . speculative . cpuparams_batch , & params . cpuparams_batch ) ;
2024-09-09 23:36:09 +02:00
if ( params . prompt_cache_all & & ( params . interactive | | params . interactive_first ) ) {
throw std : : invalid_argument ( " error: --prompt-cache-all not supported in interactive mode yet \n " ) ;
}
2024-12-18 18:27:21 +01:00
// TODO: refactor model params in a common struct
2025-01-13 13:56:23 +01:00
common_params_handle_model_default ( params . model , params . model_url , params . hf_repo , params . hf_file , params . hf_token ) ;
common_params_handle_model_default ( params . vocoder . model , params . vocoder . model_url , params . vocoder . hf_repo , params . vocoder . hf_file , params . hf_token ) ;
2024-09-09 23:36:09 +02:00
if ( params . escape ) {
string_process_escapes ( params . prompt ) ;
string_process_escapes ( params . input_prefix ) ;
string_process_escapes ( params . input_suffix ) ;
for ( auto & antiprompt : params . antiprompt ) {
string_process_escapes ( antiprompt ) ;
}
2024-11-25 08:58:41 +01:00
for ( auto & seq_breaker : params . sampling . dry_sequence_breakers ) {
2024-10-25 18:07:34 +02:00
string_process_escapes ( seq_breaker ) ;
}
2024-09-09 23:36:09 +02:00
}
if ( ! params . kv_overrides . empty ( ) ) {
params . kv_overrides . emplace_back ( ) ;
params . kv_overrides . back ( ) . key [ 0 ] = 0 ;
}
2024-09-28 16:42:03 +02:00
if ( params . reranking & & params . embedding ) {
throw std : : invalid_argument ( " error: either --embedding or --reranking can be specified, but not both " ) ;
}
2024-09-09 23:36:09 +02:00
return true ;
}
2024-10-10 22:57:42 +02:00
// Prints the usage text to stdout in three sections: common options,
// sampling options and options specific to the current example.
static void common_params_print_usage(common_params_context & ctx_arg) {
    std::vector<common_arg *> common_options;
    std::vector<common_arg *> sparam_options;
    std::vector<common_arg *> specific_options;

    for (auto & opt : ctx_arg.options) {
        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
        std::vector<common_arg *> * bucket = &common_options;
        if (opt.is_sparam) {
            bucket = &sparam_options;
        } else if (opt.in_example(ctx_arg.ex)) {
            bucket = &specific_options;
        }
        bucket->push_back(&opt);
    }

    // emits a section header followed by one usage entry per option
    const auto print_section = [](const char * header, const std::vector<common_arg *> & options) {
        printf("%s", header);
        for (common_arg * opt : options) {
            printf("%s", opt->to_string().c_str());
        }
    };

    print_section("----- common params -----\n\n", common_options);
    print_section("\n\n----- sampling params -----\n\n", sparam_options);
    // TODO: maybe convert enum llama_example to string
    print_section("\n\n----- example-specific params -----\n\n", specific_options);
}
2024-11-25 19:30:06 +01:00
static std : : vector < ggml_backend_dev_t > parse_device_list ( const std : : string & value ) {
std : : vector < ggml_backend_dev_t > devices ;
auto dev_names = string_split < std : : string > ( value , ' , ' ) ;
if ( dev_names . empty ( ) ) {
throw std : : invalid_argument ( " no devices specified " ) ;
}
if ( dev_names . size ( ) = = 1 & & dev_names [ 0 ] = = " none " ) {
devices . push_back ( nullptr ) ;
} else {
for ( const auto & device : dev_names ) {
auto * dev = ggml_backend_dev_by_name ( device . c_str ( ) ) ;
if ( ! dev | | ggml_backend_dev_type ( dev ) ! = GGML_BACKEND_DEVICE_TYPE_GPU ) {
throw std : : invalid_argument ( string_format ( " invalid device: %s " , device . c_str ( ) ) ) ;
}
devices . push_back ( dev ) ;
}
devices . push_back ( nullptr ) ;
}
return devices ;
}
2024-10-10 22:57:42 +02:00
bool common_params_parse ( int argc , char * * argv , common_params & params , llama_example ex , void ( * print_usage ) ( int , char * * ) ) {
auto ctx_arg = common_params_parser_init ( params , ex , print_usage ) ;
const common_params params_org = ctx_arg . params ; // the example can modify the default params
2024-09-09 23:36:09 +02:00
try {
2024-10-10 22:57:42 +02:00
if ( ! common_params_parse_ex ( argc , argv , ctx_arg ) ) {
2024-09-09 23:36:09 +02:00
ctx_arg . params = params_org ;
return false ;
}
if ( ctx_arg . params . usage ) {
2024-10-10 22:57:42 +02:00
common_params_print_usage ( ctx_arg ) ;
2024-09-09 23:36:09 +02:00
if ( ctx_arg . print_usage ) {
ctx_arg . print_usage ( argc , argv ) ;
}
exit ( 0 ) ;
}
} catch ( const std : : invalid_argument & ex ) {
fprintf ( stderr , " %s \n " , ex . what ( ) ) ;
ctx_arg . params = params_org ;
return false ;
}
return true ;
}
2024-12-02 22:10:19 +01:00
static std : : string list_builtin_chat_templates ( ) {
std : : vector < const char * > supported_tmpl ;
int32_t res = llama_chat_builtin_templates ( nullptr , 0 ) ;
supported_tmpl . resize ( res ) ;
res = llama_chat_builtin_templates ( supported_tmpl . data ( ) , supported_tmpl . size ( ) ) ;
std : : ostringstream msg ;
for ( auto & tmpl : supported_tmpl ) {
msg < < tmpl < < ( & tmpl = = & supported_tmpl . back ( ) ? " " : " , " ) ;
}
return msg . str ( ) ;
}
2024-10-10 22:57:42 +02:00
common_params_context common_params_parser_init ( common_params & params , llama_example ex , void ( * print_usage ) ( int , char * * ) ) {
2024-11-25 19:30:06 +01:00
// load dynamic backends
ggml_backend_load_all ( ) ;
2024-10-10 22:57:42 +02:00
common_params_context ctx_arg ( params ) ;
2024-09-09 23:36:09 +02:00
ctx_arg . print_usage = print_usage ;
ctx_arg . ex = ex ;
std : : string sampler_type_chars ;
std : : string sampler_type_names ;
2024-11-25 08:58:41 +01:00
for ( const auto & sampler : params . sampling . samplers ) {
2024-10-10 22:57:42 +02:00
sampler_type_chars + = common_sampler_type_to_chr ( sampler ) ;
sampler_type_names + = common_sampler_type_to_str ( sampler ) + " ; " ;
2024-09-09 23:36:09 +02:00
}
sampler_type_names . pop_back ( ) ;
/**
* filter options by example
* rules :
* - all examples inherit options from LLAMA_EXAMPLE_COMMON
* - if LLAMA_EXAMPLE_ * is set ( other than COMMON ) , we only show the option in the corresponding example
* - if both { LLAMA_EXAMPLE_COMMON , LLAMA_EXAMPLE_ * , } are set , we will prioritize the LLAMA_EXAMPLE_ * matching current example
*/
2024-10-10 22:57:42 +02:00
auto add_opt = [ & ] ( common_arg arg ) {
2025-01-08 11:55:36 +01:00
if ( ( arg . in_example ( ex ) | | arg . in_example ( LLAMA_EXAMPLE_COMMON ) ) & & ! arg . is_exclude ( ex ) ) {
2024-09-09 23:36:09 +02:00
ctx_arg . options . push_back ( std : : move ( arg ) ) ;
}
} ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -h " , " --help " , " --usage " } ,
" print usage and exit " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . usage = true ;
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --version " } ,
" show version and build info " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & ) {
2024-09-09 23:36:09 +02:00
fprintf ( stderr , " version: %d (%s) \n " , LLAMA_BUILD_NUMBER , LLAMA_COMMIT ) ;
fprintf ( stderr , " built with %s for %s \n " , LLAMA_COMPILER , LLAMA_BUILD_TARGET ) ;
exit ( 0 ) ;
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --verbose-prompt " } ,
2024-10-12 07:21:51 +02:00
string_format ( " print a verbose prompt before generation (default: %s) " , params . verbose_prompt ? " true " : " false " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . verbose_prompt = true ;
}
2024-09-28 16:42:03 +02:00
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --no-display-prompt " } ,
2024-10-12 07:21:51 +02:00
string_format ( " don't print prompt at generation (default: %s) " , ! params . display_prompt ? " true " : " false " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . display_prompt = false ;
}
) . set_examples ( { LLAMA_EXAMPLE_MAIN } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -co " , " --color " } ,
2024-10-12 07:21:51 +02:00
string_format ( " colorise output to distinguish prompt and user input from generations (default: %s) " , params . use_color ? " true " : " false " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . use_color = true ;
}
2024-09-15 19:46:12 +02:00
) . set_examples ( { LLAMA_EXAMPLE_MAIN , LLAMA_EXAMPLE_INFILL , LLAMA_EXAMPLE_SPECULATIVE , LLAMA_EXAMPLE_LOOKUP } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -t " , " --threads " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " number of threads to use during generation (default: %d) " , params . cpuparams . n_threads ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . cpuparams . n_threads = value ;
if ( params . cpuparams . n_threads < = 0 ) {
params . cpuparams . n_threads = std : : thread : : hardware_concurrency ( ) ;
}
}
) . set_env ( " LLAMA_ARG_THREADS " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -tb " , " --threads-batch " } , " N " ,
" number of threads to use during batch and prompt processing (default: same as --threads) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . cpuparams_batch . n_threads = value ;
if ( params . cpuparams_batch . n_threads < = 0 ) {
params . cpuparams_batch . n_threads = std : : thread : : hardware_concurrency ( ) ;
}
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -C " , " --cpu-mask " } , " M " ,
" CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \" \" ) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & mask ) {
2024-09-09 23:36:09 +02:00
params . cpuparams . mask_valid = true ;
if ( ! parse_cpu_mask ( mask , params . cpuparams . cpumask ) ) {
throw std : : invalid_argument ( " invalid cpumask " ) ;
}
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -Cr " , " --cpu-range " } , " lo-hi " ,
" range of CPUs for affinity. Complements --cpu-mask " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & range ) {
2024-09-09 23:36:09 +02:00
params . cpuparams . mask_valid = true ;
if ( ! parse_cpu_range ( range , params . cpuparams . cpumask ) ) {
throw std : : invalid_argument ( " invalid range " ) ;
}
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --cpu-strict " } , " <0|1> " ,
2024-10-12 07:21:51 +02:00
string_format ( " use strict CPU placement (default: %u) \n " , ( unsigned ) params . cpuparams . strict_cpu ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . cpuparams . strict_cpu = std : : stoul ( value ) ;
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --prio " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d) \n " , params . cpuparams . priority ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int prio ) {
2024-09-09 23:36:09 +02:00
if ( prio < 0 | | prio > 3 ) {
throw std : : invalid_argument ( " invalid value " ) ;
}
params . cpuparams . priority = ( enum ggml_sched_priority ) prio ;
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --poll " } , " <0...100> " ,
2024-10-12 07:21:51 +02:00
string_format ( " use polling level to wait for work (0 - no polling, default: %u) \n " , ( unsigned ) params . cpuparams . poll ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . cpuparams . poll = std : : stoul ( value ) ;
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -Cb " , " --cpu-mask-batch " } , " M " ,
" CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & mask ) {
2024-09-09 23:36:09 +02:00
params . cpuparams_batch . mask_valid = true ;
if ( ! parse_cpu_mask ( mask , params . cpuparams_batch . cpumask ) ) {
throw std : : invalid_argument ( " invalid cpumask " ) ;
}
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -Crb " , " --cpu-range-batch " } , " lo-hi " ,
" ranges of CPUs for affinity. Complements --cpu-mask-batch " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & range ) {
2024-09-09 23:36:09 +02:00
params . cpuparams_batch . mask_valid = true ;
if ( ! parse_cpu_range ( range , params . cpuparams_batch . cpumask ) ) {
throw std : : invalid_argument ( " invalid range " ) ;
}
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --cpu-strict-batch " } , " <0|1> " ,
" use strict CPU placement (default: same as --cpu-strict) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . cpuparams_batch . strict_cpu = value ;
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --prio-batch " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d) \n " , params . cpuparams_batch . priority ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int prio ) {
2024-09-09 23:36:09 +02:00
if ( prio < 0 | | prio > 3 ) {
throw std : : invalid_argument ( " invalid value " ) ;
}
params . cpuparams_batch . priority = ( enum ggml_sched_priority ) prio ;
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --poll-batch " } , " <0|1> " ,
" use polling to wait for work (default: same as --poll) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . cpuparams_batch . poll = value ;
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -lcs " , " --lookup-cache-static " } , " FNAME " ,
" path to static lookup cache to use for lookup decoding (not updated by generation) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . lookup_cache_static = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_LOOKUP } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -lcd " , " --lookup-cache-dynamic " } , " FNAME " ,
" path to dynamic lookup cache to use for lookup decoding (updated by generation) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . lookup_cache_dynamic = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_LOOKUP } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -c " , " --ctx-size " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " size of the prompt context (default: %d, 0 = loaded from model) " , params . n_ctx ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . n_ctx = value ;
}
) . set_env ( " LLAMA_ARG_CTX_SIZE " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -n " , " --predict " , " --n-predict " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled) " , params . n_predict ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . n_predict = value ;
}
) . set_env ( " LLAMA_ARG_N_PREDICT " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -b " , " --batch-size " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " logical maximum batch size (default: %d) " , params . n_batch ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . n_batch = value ;
}
) . set_env ( " LLAMA_ARG_BATCH " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -ub " , " --ubatch-size " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " physical maximum batch size (default: %d) " , params . n_ubatch ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . n_ubatch = value ;
}
) . set_env ( " LLAMA_ARG_UBATCH " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --keep " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " number of tokens to keep from the initial prompt (default: %d, -1 = all) " , params . n_keep ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . n_keep = value ;
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-16 08:20:01 +02:00
{ " --no-context-shift " } ,
2024-10-12 07:21:51 +02:00
string_format ( " disables context shift on inifinite text generation (default: %s) " , params . ctx_shift ? " disabled " : " enabled " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-16 08:20:01 +02:00
params . ctx_shift = false ;
}
2024-12-20 10:44:58 +01:00
) . set_examples ( { LLAMA_EXAMPLE_MAIN , LLAMA_EXAMPLE_SERVER , LLAMA_EXAMPLE_IMATRIX , LLAMA_EXAMPLE_PERPLEXITY } ) . set_env ( " LLAMA_ARG_NO_CONTEXT_SHIFT " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --chunks " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " max number of chunks to process (default: %d, -1 = all) " , params . n_chunks ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . n_chunks = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_IMATRIX , LLAMA_EXAMPLE_PERPLEXITY , LLAMA_EXAMPLE_RETRIEVAL } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -fa " , " --flash-attn " } ,
2024-10-12 07:21:51 +02:00
string_format ( " enable Flash Attention (default: %s) " , params . flash_attn ? " enabled " : " disabled " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . flash_attn = true ;
}
) . set_env ( " LLAMA_ARG_FLASH_ATTN " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -p " , " --prompt " } , " PROMPT " ,
ex = = LLAMA_EXAMPLE_MAIN
? " prompt to start generation with \n if -cnv is set, this will be used as system prompt "
: " prompt to start generation with " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . prompt = value ;
}
2025-01-08 11:55:36 +01:00
) . set_excludes ( { LLAMA_EXAMPLE_SERVER } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-13 08:53:38 +02:00
{ " --no-perf " } ,
2024-10-12 07:21:51 +02:00
string_format ( " disable internal libllama performance timings (default: %s) " , params . no_perf ? " true " : " false " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-13 08:53:38 +02:00
params . no_perf = true ;
2024-11-25 08:58:41 +01:00
params . sampling . no_perf = true ;
2024-09-13 08:53:38 +02:00
}
) . set_env ( " LLAMA_ARG_NO_PERF " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -f " , " --file " } , " FNAME " ,
" a file containing the prompt (default: none) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
std : : ifstream file ( value ) ;
if ( ! file ) {
2024-10-12 07:21:51 +02:00
throw std : : runtime_error ( string_format ( " error: failed to open file '%s' \n " , value . c_str ( ) ) ) ;
2024-09-09 23:36:09 +02:00
}
// store the external file name in params
params . prompt_file = value ;
std : : copy ( std : : istreambuf_iterator < char > ( file ) , std : : istreambuf_iterator < char > ( ) , back_inserter ( params . prompt ) ) ;
if ( ! params . prompt . empty ( ) & & params . prompt . back ( ) = = ' \n ' ) {
params . prompt . pop_back ( ) ;
}
}
2025-01-08 11:55:36 +01:00
) . set_excludes ( { LLAMA_EXAMPLE_SERVER } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --in-file " } , " FNAME " ,
" an input file (repeat to specify multiple files) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
std : : ifstream file ( value ) ;
if ( ! file ) {
2024-10-12 07:21:51 +02:00
throw std : : runtime_error ( string_format ( " error: failed to open file '%s' \n " , value . c_str ( ) ) ) ;
2024-09-09 23:36:09 +02:00
}
params . in_files . push_back ( value ) ;
}
) . set_examples ( { LLAMA_EXAMPLE_IMATRIX } ) ) ;
2024-10-10 22:57:42 +02:00
// -bf / --binary-file: replace the prompt with the raw bytes of a file.
// The option is excluded for LLAMA_EXAMPLE_SERVER.
add_opt(common_arg(
    {" -bf ", " --binary-file "}, " FNAME ",
    " binary file containing the prompt (default: none) ",
    [](common_params & cfg, const std::string & path) {
        std::ifstream input(path, std::ios::binary);
        if (input.fail()) {
            throw std::runtime_error(string_format(" error: failed to open file '%s' \n ", path.c_str()));
        }
        // remember which external file the prompt came from
        cfg.prompt_file = path;
        // assign (not append): any previous prompt content is discarded
        cfg.prompt.assign(std::istreambuf_iterator<char>(input), std::istreambuf_iterator<char>());
        fprintf(stderr, " Read %zu bytes from binary file %s \n ", cfg.prompt.size(), path.c_str());
    }
).set_excludes({LLAMA_EXAMPLE_SERVER}));
2024-10-10 22:57:42 +02:00
// -e / --escape: enable processing of escape sequences.
// The help text embeds the current value of `params.escape` as the default.
// (Help wording fixed: "escapes sequences" -> "escape sequences", matching --no-escape.)
add_opt(common_arg(
    {" -e ", " --escape "},
    string_format(" process escape sequences ( \\ n, \\ r, \\ t, \\ ', \\ \" , \\ \\ ) (default: %s) ", params.escape ? " true " : " false "),
    [](common_params & params) {
        params.escape = true;
    }
));
2024-10-10 22:57:42 +02:00
// --no-escape: switch escape-sequence processing off.
add_opt(common_arg(
    {" --no-escape "},
    " do not process escape sequences ",
    [](common_params & cfg) { cfg.escape = false; }
));
2024-10-10 22:57:42 +02:00
// -ptc / --print-token-count: store N in n_print. Listed for LLAMA_EXAMPLE_MAIN.
add_opt(common_arg(
    {" -ptc ", " --print-token-count "}, " N ",
    string_format(" print token count every N tokens (default: %d) ", params.n_print),
    [](common_params & cfg, int interval) { cfg.n_print = interval; }
).set_examples({LLAMA_EXAMPLE_MAIN}));
2024-10-10 22:57:42 +02:00
// Prompt-cache options, all listed for LLAMA_EXAMPLE_MAIN.

// --prompt-cache: path of the prompt cache file.
add_opt(common_arg(
    {" --prompt-cache "}, " FNAME ",
    " file to cache prompt state for faster startup (default: none) ",
    [](common_params & cfg, const std::string & path) { cfg.path_prompt_cache = path; }
).set_examples({LLAMA_EXAMPLE_MAIN}));

// --prompt-cache-all: sets prompt_cache_all.
add_opt(common_arg(
    {" --prompt-cache-all "},
    " if specified, saves user input and generations to cache as well \n ",
    [](common_params & cfg) { cfg.prompt_cache_all = true; }
).set_examples({LLAMA_EXAMPLE_MAIN}));

// --prompt-cache-ro: sets prompt_cache_ro.
add_opt(common_arg(
    {" --prompt-cache-ro "},
    " if specified, uses the prompt cache but does not update it ",
    [](common_params & cfg) { cfg.prompt_cache_ro = true; }
).set_examples({LLAMA_EXAMPLE_MAIN}));
2024-10-10 22:57:42 +02:00
// -r / --reverse-prompt: append one more entry to the antiprompt list (repeatable).
add_opt(common_arg(
    {" -r ", " --reverse-prompt "}, " PROMPT ",
    " halt generation at PROMPT, return control in interactive mode \n ",
    [](common_params & cfg, const std::string & stop_text) {
        cfg.antiprompt.emplace_back(stop_text);
    }
).set_examples({LLAMA_EXAMPLE_MAIN}));
2024-10-10 22:57:42 +02:00
// -sp / --special: sets the `special` flag.
// Listed for LLAMA_EXAMPLE_MAIN and LLAMA_EXAMPLE_SERVER.
add_opt(common_arg(
    {" -sp ", " --special "},
    string_format(" special tokens output enabled (default: %s) ", params.special ? " true " : " false "),
    [](common_params & cfg) { cfg.special = true; }
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
2024-10-10 22:57:42 +02:00
// Conversation-mode switches; both write params.conversation_mode and are listed for LLAMA_EXAMPLE_MAIN.

// -cnv / --conversation: force conversation mode on.
add_opt(common_arg(
    {" -cnv ", " --conversation "},
    " run in conversation mode: \n "
    " - does not print special tokens and suffix/prefix \n "
    " - interactive mode is also enabled \n "
    " (default: auto enabled if chat template is available) ",
    [](common_params & cfg) {
        cfg.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
    }
).set_examples({LLAMA_EXAMPLE_MAIN}));

// -no-cnv / --no-conversation: force conversation mode off.
add_opt(common_arg(
    {" -no-cnv ", " --no-conversation "},
    " force disable conversation mode (default: false) ",
    [](common_params & cfg) {
        cfg.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
    }
).set_examples({LLAMA_EXAMPLE_MAIN}));
2024-10-10 22:57:42 +02:00
// Interactive-mode flags, all listed for LLAMA_EXAMPLE_MAIN.

// -i / --interactive
add_opt(common_arg(
    {" -i ", " --interactive "},
    string_format(" run in interactive mode (default: %s) ", params.interactive ? " true " : " false "),
    [](common_params & cfg) { cfg.interactive = true; }
).set_examples({LLAMA_EXAMPLE_MAIN}));

// -if / --interactive-first
add_opt(common_arg(
    {" -if ", " --interactive-first "},
    string_format(" run in interactive mode and wait for input right away (default: %s) ", params.interactive_first ? " true " : " false "),
    [](common_params & cfg) { cfg.interactive_first = true; }
).set_examples({LLAMA_EXAMPLE_MAIN}));

// -mli / --multiline-input
add_opt(common_arg(
    {" -mli ", " --multiline-input "},
    " allows you to write or paste multiple lines without ending each in ' \\ ' ",
    [](common_params & cfg) { cfg.multiline_input = true; }
).set_examples({LLAMA_EXAMPLE_MAIN}));
2024-10-10 22:57:42 +02:00
// Input prefix/suffix options. Each handler also sets enable_chat_template to false.

// --in-prefix-bos (listed for LLAMA_EXAMPLE_MAIN)
add_opt(common_arg(
    {" --in-prefix-bos "},
    " prefix BOS to user inputs, preceding the `--in-prefix` string ",
    [](common_params & cfg) {
        cfg.enable_chat_template = false;
        cfg.input_prefix_bos     = true;
    }
).set_examples({LLAMA_EXAMPLE_MAIN}));

// --in-prefix (listed for LLAMA_EXAMPLE_MAIN and LLAMA_EXAMPLE_INFILL)
add_opt(common_arg(
    {" --in-prefix "}, " STRING ",
    " string to prefix user inputs with (default: empty) ",
    [](common_params & cfg, const std::string & text) {
        cfg.enable_chat_template = false;
        cfg.input_prefix         = text;
    }
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));

// --in-suffix (listed for LLAMA_EXAMPLE_MAIN and LLAMA_EXAMPLE_INFILL)
add_opt(common_arg(
    {" --in-suffix "}, " STRING ",
    " string to suffix after user inputs with (default: empty) ",
    [](common_params & cfg, const std::string & text) {
        cfg.enable_chat_template = false;
        cfg.input_suffix         = text;
    }
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
2024-10-10 22:57:42 +02:00
// --no-warmup: clears the `warmup` flag.
// Listed for LLAMA_EXAMPLE_MAIN and LLAMA_EXAMPLE_SERVER.
add_opt(common_arg(
    {" --no-warmup "},
    " skip warming up the model with an empty run ",
    [](common_params & cfg) { cfg.warmup = false; }
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
2024-10-10 22:57:42 +02:00
// --spm-infill: sets spm_infill; the help text shows the current default.
// Listed for LLAMA_EXAMPLE_SERVER and LLAMA_EXAMPLE_INFILL.
add_opt(common_arg(
    {" --spm-infill "},
    string_format(
        " use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s) ",
        params.spm_infill ? " enabled " : " disabled "),
    [](common_params & cfg) { cfg.spm_infill = true; }
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
2024-10-10 22:57:42 +02:00
// --samplers: split the argument on the separator literal and convert the
// resulting names into the sampler list. Flagged as a sampling parameter.
add_opt(common_arg(
    {" --samplers "}, " SAMPLERS ",
    string_format(" samplers that will be used for generation in the order, separated by \' ; \' \n (default: %s) ", sampler_type_names.c_str()),
    [](common_params & cfg, const std::string & list) {
        const auto names = string_split<std::string>(list, ' ; ');
        cfg.sampling.samplers = common_sampler_types_from_names(names, true);
    }
).set_sparam());
2024-10-10 22:57:42 +02:00
// -s / --seed: parse the argument with std::stoul and store it as the sampling seed.
// Flagged as a sampling parameter.
add_opt(common_arg(
    {" -s ", " --seed "}, " SEED ",
    string_format(" RNG seed (default: %d, use random seed for %d) ", params.sampling.seed, LLAMA_DEFAULT_SEED),
    [](common_params & cfg, const std::string & text) {
        const auto parsed = std::stoul(text);
        cfg.sampling.seed = parsed;
    }
).set_sparam());
2024-10-10 22:57:42 +02:00
// --sampling-seq / --sampler-seq: build the sampler list from a character sequence.
// Flagged as a sampling parameter.
add_opt(common_arg(
    {" --sampling-seq ", " --sampler-seq "}, " SEQUENCE ",
    string_format(" simplified sequence for samplers that will be used (default: %s) ", sampler_type_chars.c_str()),
    [](common_params & cfg, const std::string & sequence) {
        cfg.sampling.samplers = common_sampler_types_from_chars(sequence);
    }
).set_sparam());
2024-10-10 22:57:42 +02:00
// --ignore-eos: sets sampling.ignore_eos. Flagged as a sampling parameter.
add_opt(common_arg(
    {" --ignore-eos "},
    " ignore end of stream token and continue generating (implies --logit-bias EOS-inf) ",
    [](common_params & cfg) { cfg.sampling.ignore_eos = true; }
).set_sparam());
2024-10-10 22:57:42 +02:00
// --temp: parse a float and clamp it from below at 0. Flagged as a sampling parameter.
add_opt(common_arg(
    {" --temp "}, " N ",
    string_format(" temperature (default: %.1f) ", static_cast<double>(params.sampling.temp)),
    [](common_params & cfg, const std::string & text) {
        const float parsed = std::stof(text);
        cfg.sampling.temp = std::max(parsed, 0.0f);
    }
).set_sparam());
2024-10-10 22:57:42 +02:00
// Truncation samplers; every option here is flagged as a sampling parameter.

// --top-k
add_opt(common_arg(
    {" --top-k "}, " N ",
    string_format(" top-k sampling (default: %d, 0 = disabled) ", params.sampling.top_k),
    [](common_params & cfg, int k) { cfg.sampling.top_k = k; }
).set_sparam());

// --top-p
add_opt(common_arg(
    {" --top-p "}, " N ",
    string_format(" top-p sampling (default: %.1f, 1.0 = disabled) ", static_cast<double>(params.sampling.top_p)),
    [](common_params & cfg, const std::string & text) { cfg.sampling.top_p = std::stof(text); }
).set_sparam());

// --min-p
add_opt(common_arg(
    {" --min-p "}, " N ",
    string_format(" min-p sampling (default: %.1f, 0.0 = disabled) ", static_cast<double>(params.sampling.min_p)),
    [](common_params & cfg, const std::string & text) { cfg.sampling.min_p = std::stof(text); }
).set_sparam());
2024-10-15 12:54:55 +02:00
// XTC sampler settings; both are flagged as sampling parameters.

// --xtc-probability
add_opt(common_arg(
    {" --xtc-probability "}, " N ",
    string_format(" xtc probability (default: %.1f, 0.0 = disabled) ", static_cast<double>(params.sampling.xtc_probability)),
    [](common_params & cfg, const std::string & text) { cfg.sampling.xtc_probability = std::stof(text); }
).set_sparam());

// --xtc-threshold
add_opt(common_arg(
    {" --xtc-threshold "}, " N ",
    string_format(" xtc threshold (default: %.1f, 1.0 = disabled) ", static_cast<double>(params.sampling.xtc_threshold)),
    [](common_params & cfg, const std::string & text) { cfg.sampling.xtc_threshold = std::stof(text); }
).set_sparam());
2024-10-10 22:57:42 +02:00
// --typical: stores the parsed float in sampling.typ_p. Flagged as a sampling parameter.
add_opt(common_arg(
    {" --typical "}, " N ",
    string_format(" locally typical sampling, parameter p (default: %.1f, 1.0 = disabled) ", static_cast<double>(params.sampling.typ_p)),
    [](common_params & cfg, const std::string & text) { cfg.sampling.typ_p = std::stof(text); }
).set_sparam());
2024-10-10 22:57:42 +02:00
// --repeat-last-n: values below -1 are rejected; accepted values are stored in
// penalty_last_n, and n_prev is raised to at least that value.
// Flagged as a sampling parameter.
add_opt(common_arg(
    {" --repeat-last-n "}, " N ",
    string_format(" last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size) ", params.sampling.penalty_last_n),
    [](common_params & cfg, int n) {
        const bool out_of_range = n < -1;
        if (out_of_range) {
            throw std::runtime_error(string_format(" error: invalid repeat-last-n = %d \n ", n));
        }
        cfg.sampling.penalty_last_n = n;
        if (cfg.sampling.n_prev < cfg.sampling.penalty_last_n) {
            cfg.sampling.n_prev = cfg.sampling.penalty_last_n;
        }
    }
).set_sparam());
2024-10-10 22:57:42 +02:00
// Repetition penalties; each handler parses a float and each option is flagged
// as a sampling parameter.

// --repeat-penalty
add_opt(common_arg(
    {" --repeat-penalty "}, " N ",
    string_format(" penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled) ", static_cast<double>(params.sampling.penalty_repeat)),
    [](common_params & cfg, const std::string & text) { cfg.sampling.penalty_repeat = std::stof(text); }
).set_sparam());

// --presence-penalty
add_opt(common_arg(
    {" --presence-penalty "}, " N ",
    string_format(" repeat alpha presence penalty (default: %.1f, 0.0 = disabled) ", static_cast<double>(params.sampling.penalty_present)),
    [](common_params & cfg, const std::string & text) { cfg.sampling.penalty_present = std::stof(text); }
).set_sparam());

// --frequency-penalty
add_opt(common_arg(
    {" --frequency-penalty "}, " N ",
    string_format(" repeat alpha frequency penalty (default: %.1f, 0.0 = disabled) ", static_cast<double>(params.sampling.penalty_freq)),
    [](common_params & cfg, const std::string & text) { cfg.sampling.penalty_freq = std::stof(text); }
).set_sparam());
2024-10-25 18:07:34 +02:00
// --dry-multiplier: stores the parsed float. Flagged as a sampling parameter.
add_opt(common_arg(
    {" --dry-multiplier "}, " N ",
    string_format(" set DRY sampling multiplier (default: %.1f, 0.0 = disabled) ", static_cast<double>(params.sampling.dry_multiplier)),
    [](common_params & cfg, const std::string & text) { cfg.sampling.dry_multiplier = std::stof(text); }
).set_sparam());
// --dry-base: only a parsed value >= 1.0 is stored; anything else leaves
// dry_base untouched without reporting an error. Flagged as a sampling parameter.
add_opt(common_arg(
    {" --dry-base "}, " N ",
    string_format(" set DRY sampling base value (default: %.2f) ", static_cast<double>(params.sampling.dry_base)),
    [](common_params & cfg, const std::string & text) {
        const float candidate = std::stof(text);
        // written as a negated >= so that NaN is skipped as well
        if (!(candidate >= 1.0f)) {
            return;
        }
        cfg.sampling.dry_base = candidate;
    }
).set_sparam());
// Integer DRY settings; both are flagged as sampling parameters.

// --dry-allowed-length
add_opt(common_arg(
    {" --dry-allowed-length "}, " N ",
    string_format(" set allowed length for DRY sampling (default: %d) ", params.sampling.dry_allowed_length),
    [](common_params & cfg, int length) { cfg.sampling.dry_allowed_length = length; }
).set_sparam());

// --dry-penalty-last-n: values below -1 are rejected.
add_opt(common_arg(
    {" --dry-penalty-last-n "}, " N ",
    string_format(" set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size) ", params.sampling.dry_penalty_last_n),
    [](common_params & cfg, int n) {
        const bool out_of_range = n < -1;
        if (out_of_range) {
            throw std::runtime_error(string_format(" error: invalid dry-penalty-last-n = %d \n ", n));
        }
        cfg.sampling.dry_penalty_last_n = n;
    }
).set_sparam());
// --dry-sequence-breaker: add a sequence breaker for DRY sampling (repeatable).
// Flagged as a sampling parameter.
add_opt ( common_arg (
{ " --dry-sequence-breaker " } , " STRING " ,
// The help text lists the breakers currently held in params: each one is wrapped
// in quotes and the entries are joined with commas via std::accumulate; an entry
// equal to the newline literal is shown in escaped form. An empty list is shown
// as the "none" literal.
string_format ( " add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \" none \" to not use any sequence breakers \n " ,
2024-11-25 08:58:41 +01:00
params . sampling . dry_sequence_breakers . empty ( ) ? " none " :
std : : accumulate ( std : : next ( params . sampling . dry_sequence_breakers . begin ( ) ) ,
params . sampling . dry_sequence_breakers . end ( ) ,
std : : string ( " ' " ) + ( params . sampling . dry_sequence_breakers [ 0 ] = = " \n " ? " \\ n " : params . sampling . dry_sequence_breakers [ 0 ] ) + " ' " ,
2024-10-25 18:07:34 +02:00
[ ] ( const std : : string & a , const std : : string & b ) {
std : : string formatted_b = ( b = = " \n " ) ? " \\ n " : b ;
return a + " , ' " + formatted_b + " ' " ;
} ) . c_str ( ) ) ,
[ ] ( common_params & params , const std : : string & value ) {
// The first invocation of this handler discards the breakers already in params.
// NOTE(review): the flag is a function-local static, so "first" is tracked once
// for the whole process rather than per common_params instance - confirm this is
// intended if arguments can be parsed more than once.
static bool defaults_cleared = false ;
if ( ! defaults_cleared ) {
2024-11-25 08:58:41 +01:00
params . sampling . dry_sequence_breakers . clear ( ) ;
2024-10-25 18:07:34 +02:00
defaults_cleared = true ;
}
// The "none" literal empties the list (including breakers added by earlier
// uses of this option); any other value is appended as a new breaker.
if ( value = = " none " ) {
2024-11-25 08:58:41 +01:00
params . sampling . dry_sequence_breakers . clear ( ) ;
2024-10-25 18:07:34 +02:00
} else {
2024-11-25 08:58:41 +01:00
params . sampling . dry_sequence_breakers . emplace_back ( value ) ;
2024-10-25 18:07:34 +02:00
}
}
) . set_sparam ( ) ) ;
2024-10-10 22:57:42 +02:00
// Dynamic-temperature settings; both are flagged as sampling parameters.

// --dynatemp-range
add_opt(common_arg(
    {" --dynatemp-range "}, " N ",
    string_format(" dynamic temperature range (default: %.1f, 0.0 = disabled) ", static_cast<double>(params.sampling.dynatemp_range)),
    [](common_params & cfg, const std::string & text) { cfg.sampling.dynatemp_range = std::stof(text); }
).set_sparam());

// --dynatemp-exp
add_opt(common_arg(
    {" --dynatemp-exp "}, " N ",
    string_format(" dynamic temperature exponent (default: %.1f) ", static_cast<double>(params.sampling.dynatemp_exponent)),
    [](common_params & cfg, const std::string & text) { cfg.sampling.dynatemp_exponent = std::stof(text); }
).set_sparam());
2024-10-10 22:57:42 +02:00
// --mirostat: select the Mirostat mode. Flagged as a sampling parameter.
// The help text documents three modes (0, 1, 2); any other value is rejected
// here with an error, in the same way --repeat-last-n rejects out-of-range input,
// instead of being stored unchecked.
add_opt(common_arg(
    {" --mirostat "}, " N ",
    string_format(" use Mirostat sampling. \n Top K, Nucleus and Locally Typical samplers are ignored if used. \n "
                  " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) ", params.sampling.mirostat),
    [](common_params & params, int value) {
        if (value < 0 || value > 2) {
            throw std::runtime_error(string_format(" error: invalid mirostat = %d \n ", value));
        }
        params.sampling.mirostat = value;
    }
).set_sparam());
2024-10-10 22:57:42 +02:00
// Mirostat tuning values; both are flagged as sampling parameters.

// --mirostat-lr (eta)
add_opt(common_arg(
    {" --mirostat-lr "}, " N ",
    string_format(" Mirostat learning rate, parameter eta (default: %.1f) ", static_cast<double>(params.sampling.mirostat_eta)),
    [](common_params & cfg, const std::string & text) { cfg.sampling.mirostat_eta = std::stof(text); }
).set_sparam());

// --mirostat-ent (tau)
add_opt(common_arg(
    {" --mirostat-ent "}, " N ",
    string_format(" Mirostat target entropy, parameter tau (default: %.1f) ", static_cast<double>(params.sampling.mirostat_tau)),
    [](common_params & cfg, const std::string & text) { cfg.sampling.mirostat_tau = std::stof(text); }
).set_sparam());
2024-10-10 22:57:42 +02:00
// -l / --logit-bias: parse "<token id><sign><magnitude>" and append the
// resulting (token, bias) pair to sampling.logit_bias.
// Every parse failure surfaces as std::invalid_argument.
// Flagged as a sampling parameter.
add_opt(common_arg(
    {" -l ", " --logit-bias "}, " TOKEN_ID(+/-)BIAS ",
    " modifies the likelihood of token appearing in the completion, \n "
    " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello', \n "
    " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' ",
    [](common_params & cfg, const std::string & spec) {
        std::stringstream reader(spec);
        llama_token token_id;
        char        op;
        std::string magnitude;
        try {
            // read order: token id, one sign character, then the rest of the line
            const bool well_formed =
                (reader >> token_id) &&
                (reader >> op) &&
                std::getline(reader, magnitude) &&
                (op == ' + ' || op == ' - ');
            if (!well_formed) {
                throw std::invalid_argument(" invalid input format ");
            }
            const float direction = (op == ' - ') ? -1.0f : 1.0f;
            const float bias      = std::stof(magnitude) * direction;
            cfg.sampling.logit_bias.push_back({token_id, bias});
        } catch (const std::exception &) {
            // std::stof failures and the throw above are reported the same way
            throw std::invalid_argument(" invalid input format ");
        }
    }
).set_sparam());
2024-10-10 22:57:42 +02:00
// --grammar: store the grammar text verbatim. Flagged as a sampling parameter.
add_opt(common_arg(
    {" --grammar "}, " GRAMMAR ",
    string_format(" BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s') ", params.sampling.grammar.c_str()),
    [](common_params & cfg, const std::string & text) { cfg.sampling.grammar = text; }
).set_sparam());
2024-10-10 22:57:42 +02:00
// --grammar-file: read a grammar from a file. Flagged as a sampling parameter.
add_opt(common_arg(
    {" --grammar-file "}, " FNAME ",
    " file to read grammar from ",
    [](common_params & cfg, const std::string & path) {
        std::ifstream input(path);
        if (input.fail()) {
            throw std::runtime_error(string_format(" error: failed to open file '%s' \n ", path.c_str()));
        }
        // the file contents are appended to whatever sampling.grammar already holds
        const std::istreambuf_iterator<char> first(input);
        const std::istreambuf_iterator<char> last;
        std::copy(first, last, std::back_inserter(cfg.sampling.grammar));
    }
).set_sparam());
2024-10-10 22:57:42 +02:00
// -j / --json-schema: parse the argument as JSON and store the grammar that
// json_schema_to_grammar produces from it. Flagged as a sampling parameter.
add_opt(common_arg(
    {" -j ", " --json-schema "}, " SCHEMA ",
    " JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object \n For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead ",
    [](common_params & cfg, const std::string & schema_text) {
        const json schema = json::parse(schema_text);
        cfg.sampling.grammar = json_schema_to_grammar(schema);
    }
).set_sparam());
2024-10-10 22:57:42 +02:00
// --pooling: map a name to a pooling type; unknown names raise std::invalid_argument.
// Listed for the EMBEDDING, RETRIEVAL and SERVER examples and bound to an
// environment variable through set_env.
add_opt(common_arg(
    {" --pooling "}, " {none,mean,cls,last,rank} ",
    " pooling type for embeddings, use model default if unspecified ",
    [](common_params & cfg, const std::string & name) {
        using pooling_t = decltype(cfg.pooling_type);
        const struct {
            const char * label;
            pooling_t    type;
        } choices[] = {
            {" none ", LLAMA_POOLING_TYPE_NONE},
            {" mean ", LLAMA_POOLING_TYPE_MEAN},
            {" cls ",  LLAMA_POOLING_TYPE_CLS},
            {" last ", LLAMA_POOLING_TYPE_LAST},
            {" rank ", LLAMA_POOLING_TYPE_RANK},
        };
        // first matching label wins, same order as the accepted-values list
        for (const auto & choice : choices) {
            if (name == choice.label) {
                cfg.pooling_type = choice.type;
                return;
            }
        }
        throw std::invalid_argument(" invalid value ");
    }
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env(" LLAMA_ARG_POOLING "));
2024-10-10 22:57:42 +02:00
// --attention: map a name to an attention type; unknown names raise
// std::invalid_argument. Listed for LLAMA_EXAMPLE_EMBEDDING.
add_opt(common_arg(
    {" --attention "}, " {causal,non-causal} ",
    " attention type for embeddings, use model default if unspecified ",
    [](common_params & cfg, const std::string & name) {
        if (name == " causal ") {
            cfg.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL;
            return;
        }
        if (name == " non-causal ") {
            cfg.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL;
            return;
        }
        throw std::invalid_argument(" invalid value ");
    }
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --rope-scaling " } , " {none,linear,yarn} " ,
" RoPE frequency scaling method, defaults to linear unless specified by the model " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
/**/ if ( value = = " none " ) { params . rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE ; }
else if ( value = = " linear " ) { params . rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR ; }
else if ( value = = " yarn " ) { params . rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN ; }
else { throw std : : invalid_argument ( " invalid value " ) ; }
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_ROPE_SCALING_TYPE " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --rope-scale " } , " N " ,
" RoPE context scaling factor, expands context by a factor of N " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . rope_freq_scale = 1.0f / std : : stof ( value ) ;
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_ROPE_SCALE " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --rope-freq-base " } , " N " ,
" RoPE base frequency, used by NTK-aware scaling (default: loaded from model) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . rope_freq_base = std : : stof ( value ) ;
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_ROPE_FREQ_BASE " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --rope-freq-scale " } , " N " ,
" RoPE frequency scaling factor, expands context by a factor of 1/N " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . rope_freq_scale = std : : stof ( value ) ;
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_ROPE_FREQ_SCALE " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --yarn-orig-ctx " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " YaRN: original context size of model (default: %d = model training context size) " , params . yarn_orig_ctx ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . yarn_orig_ctx = value ;
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_YARN_ORIG_CTX " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --yarn-ext-factor " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation) " , ( double ) params . yarn_ext_factor ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . yarn_ext_factor = std : : stof ( value ) ;
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_YARN_EXT_FACTOR " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --yarn-attn-factor " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " YaRN: scale sqrt(t) or attention magnitude (default: %.1f) " , ( double ) params . yarn_attn_factor ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . yarn_attn_factor = std : : stof ( value ) ;
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_YARN_ATTN_FACTOR " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --yarn-beta-slow " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " YaRN: high correction dim or alpha (default: %.1f) " , ( double ) params . yarn_beta_slow ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . yarn_beta_slow = std : : stof ( value ) ;
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_YARN_BETA_SLOW " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --yarn-beta-fast " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " YaRN: low correction dim or beta (default: %.1f) " , ( double ) params . yarn_beta_fast ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . yarn_beta_fast = std : : stof ( value ) ;
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_YARN_BETA_FAST " ) ) ;
2024-10-10 22:57:42 +02:00
// Group-attention settings; each is bound to an environment variable through set_env.

// -gan / --grp-attn-n (listed for LLAMA_EXAMPLE_MAIN and LLAMA_EXAMPLE_PASSKEY)
add_opt(common_arg(
    {" -gan ", " --grp-attn-n "}, " N ",
    string_format(" group-attention factor (default: %d) ", params.grp_attn_n),
    [](common_params & cfg, int factor) { cfg.grp_attn_n = factor; }
).set_env(" LLAMA_ARG_GRP_ATTN_N ").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));

// -gaw / --grp-attn-w (listed for LLAMA_EXAMPLE_MAIN)
add_opt(common_arg(
    {" -gaw ", " --grp-attn-w "}, " N ",
    string_format(" group-attention width (default: %d) ", params.grp_attn_w),
    [](common_params & cfg, int width) { cfg.grp_attn_w = width; }
).set_env(" LLAMA_ARG_GRP_ATTN_W ").set_examples({LLAMA_EXAMPLE_MAIN}));
2024-10-10 22:57:42 +02:00
// KV cache flags.

// -dkvc / --dump-kv-cache: sets dump_kv_cache.
add_opt(common_arg(
    {" -dkvc ", " --dump-kv-cache "},
    " verbose print of the KV cache ",
    [](common_params & cfg) { cfg.dump_kv_cache = true; }
));

// -nkvo / --no-kv-offload: sets no_kv_offload; bound to an environment variable through set_env.
add_opt(common_arg(
    {" -nkvo ", " --no-kv-offload "},
    " disable KV offload ",
    [](common_params & cfg) { cfg.no_kv_offload = true; }
).set_env(" LLAMA_ARG_NO_KV_OFFLOAD "));
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -ctk " , " --cache-type-k " } , " TYPE " ,
2024-12-12 22:53:05 +01:00
string_format (
" KV cache data type for K \n "
" allowed values: %s \n "
" (default: %s) " ,
get_all_kv_cache_types ( ) . c_str ( ) ,
ggml_type_name ( params . cache_type_k )
) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-12-12 22:53:05 +01:00
params . cache_type_k = kv_cache_type_from_str ( value ) ;
2024-09-09 23:36:09 +02:00
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_CACHE_TYPE_K " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -ctv " , " --cache-type-v " } , " TYPE " ,
2024-12-12 22:53:05 +01:00
string_format (
" KV cache data type for V \n "
" allowed values: %s \n "
" (default: %s) " ,
get_all_kv_cache_types ( ) . c_str ( ) ,
ggml_type_name ( params . cache_type_v )
) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-12-12 22:53:05 +01:00
params . cache_type_v = kv_cache_type_from_str ( value ) ;
2024-09-09 23:36:09 +02:00
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_CACHE_TYPE_V " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --perplexity " , " --all-logits " } ,
2024-10-12 07:21:51 +02:00
string_format ( " return logits for all tokens in the batch (default: %s) " , params . logits_all ? " true " : " false " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . logits_all = true ;
}
) . set_examples ( { LLAMA_EXAMPLE_PERPLEXITY } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --hellaswag " } ,
" compute HellaSwag score over random tasks from datafile supplied with -f " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . hellaswag = true ;
}
) . set_examples ( { LLAMA_EXAMPLE_PERPLEXITY } ) ) ;
2024-10-10 22:57:42 +02:00
// --hellaswag-tasks N: how many tasks to use for the HellaSwag score.
// The current value is printed with %zu, so the destination field is presumably
// a size_t; a negative N would silently wrap to a huge count when assigned.
// Reject it instead, using the same exception type the other handlers throw.
add_opt(common_arg(
    {" --hellaswag-tasks "}, " N ",
    string_format(" number of tasks to use when computing the HellaSwag score (default: %zu) ", params.hellaswag_tasks),
    [](common_params & params, int value) {
        if (value < 0) {
            throw std::invalid_argument("number of tasks must be non-negative");
        }
        params.hellaswag_tasks = static_cast<size_t>(value);
    }
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --winogrande " } ,
" compute Winogrande score over random tasks from datafile supplied with -f " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . winogrande = true ;
}
) . set_examples ( { LLAMA_EXAMPLE_PERPLEXITY } ) ) ;
2024-10-10 22:57:42 +02:00
// --winogrande-tasks N: how many tasks to use for the Winogrande score.
// The current value is printed with %zu, so the destination field is presumably
// a size_t; a negative N would silently wrap to a huge count when assigned.
// Reject it instead, using the same exception type the other handlers throw.
add_opt(common_arg(
    {" --winogrande-tasks "}, " N ",
    string_format(" number of tasks to use when computing the Winogrande score (default: %zu) ", params.winogrande_tasks),
    [](common_params & params, int value) {
        if (value < 0) {
            throw std::invalid_argument("number of tasks must be non-negative");
        }
        params.winogrande_tasks = static_cast<size_t>(value);
    }
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --multiple-choice " } ,
" compute multiple choice score over random tasks from datafile supplied with -f " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . multiple_choice = true ;
}
) . set_examples ( { LLAMA_EXAMPLE_PERPLEXITY } ) ) ;
2024-10-10 22:57:42 +02:00
// --multiple-choice-tasks N: how many tasks to use for the multiple choice score.
// The current value is printed with %zu, so the destination field is presumably
// a size_t; a negative N would silently wrap to a huge count when assigned.
// Reject it instead, using the same exception type the other handlers throw.
add_opt(common_arg(
    {" --multiple-choice-tasks "}, " N ",
    string_format(" number of tasks to use when computing the multiple choice score (default: %zu) ", params.multiple_choice_tasks),
    [](common_params & params, int value) {
        if (value < 0) {
            throw std::invalid_argument("number of tasks must be non-negative");
        }
        params.multiple_choice_tasks = static_cast<size_t>(value);
    }
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --kl-divergence " } ,
" computes KL-divergence to logits provided via --kl-divergence-base " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . kl_divergence = true ;
}
) . set_examples ( { LLAMA_EXAMPLE_PERPLEXITY } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --save-all-logits " , " --kl-divergence-base " } , " FNAME " ,
" set logits file " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . logits_file = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_PERPLEXITY } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --ppl-stride " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " stride for perplexity calculation (default: %d) " , params . ppl_stride ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . ppl_stride = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_PERPLEXITY } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --ppl-output-type " } , " <0|1> " ,
2024-10-12 07:21:51 +02:00
string_format ( " output type for perplexity calculation (default: %d) " , params . ppl_output_type ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . ppl_output_type = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_PERPLEXITY } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -dt " , " --defrag-thold " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " KV cache defragmentation threshold (default: %.1f, < 0 - disabled) " , ( double ) params . defrag_thold ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . defrag_thold = std : : stof ( value ) ;
}
) . set_env ( " LLAMA_ARG_DEFRAG_THOLD " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -np " , " --parallel " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " number of parallel sequences to decode (default: %d) " , params . n_parallel ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . n_parallel = value ;
}
2024-09-17 15:35:38 +02:00
) . set_env ( " LLAMA_ARG_N_PARALLEL " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -ns " , " --sequences " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " number of sequences to decode (default: %d) " , params . n_sequences ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . n_sequences = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_PARALLEL } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -cb " , " --cont-batching " } ,
2024-10-12 07:21:51 +02:00
string_format ( " enable continuous batching (a.k.a dynamic batching) (default: %s) " , params . cont_batching ? " enabled " : " disabled " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . cont_batching = true ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_CONT_BATCHING " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -nocb " , " --no-cont-batching " } ,
" disable continuous batching " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . cont_batching = false ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_NO_CONT_BATCHING " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --mmproj " } , " FILE " ,
" path to a multimodal projector file for LLaVA. see examples/llava/README.md " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . mmproj = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_LLAVA } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --image " } , " FILE " ,
" path to an image file. use with multimodal models. Specify multiple times for batching " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . image . emplace_back ( value ) ;
}
) . set_examples ( { LLAMA_EXAMPLE_LLAVA } ) ) ;
2024-10-10 20:14:55 +02:00
// --rpc SERVERS is only registered when llama_supports_rpc() reports support.
// The raw comma separated list is stored as-is in params.rpc_servers; it can
// also be supplied through the LLAMA_ARG_RPC environment variable.
if (llama_supports_rpc()) {
    common_arg rpc_arg(
        {" --rpc "}, " SERVERS ",
        " comma separated list of RPC servers ",
        [](common_params & params, const std::string & servers) {
            params.rpc_servers = servers;
        }
    );
    rpc_arg.set_env(" LLAMA_ARG_RPC ");
    add_opt(rpc_arg);
}
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --mlock " } ,
" force system to keep model in RAM rather than swapping or compressing " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . use_mlock = true ;
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_MLOCK " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --no-mmap " } ,
" do not memory-map model (slower load but may reduce pageouts if not using mlock) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . use_mmap = false ;
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_NO_MMAP " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --numa " } , " TYPE " ,
" attempt optimizations that help on some NUMA systems \n "
" - distribute: spread execution evenly over all nodes \n "
" - isolate: only spawn threads on CPUs on the node that execution started on \n "
" - numactl: use the CPU map provided by numactl \n "
" if run without this previously, it is recommended to drop the system page cache before using this \n "
" see https://github.com/ggerganov/llama.cpp/issues/1437 " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
/**/ if ( value = = " distribute " | | value = = " " ) { params . numa = GGML_NUMA_STRATEGY_DISTRIBUTE ; }
else if ( value = = " isolate " ) { params . numa = GGML_NUMA_STRATEGY_ISOLATE ; }
else if ( value = = " numactl " ) { params . numa = GGML_NUMA_STRATEGY_NUMACTL ; }
else { throw std : : invalid_argument ( " invalid value " ) ; }
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_NUMA " ) ) ;
2024-11-25 19:30:06 +01:00
add_opt ( common_arg (
{ " -dev " , " --device " } , " <dev1,dev2,..> " ,
" comma-separated list of devices to use for offloading (none = don't offload) \n "
" use --list-devices to see a list of available devices " ,
[ ] ( common_params & params , const std : : string & value ) {
params . devices = parse_device_list ( value ) ;
}
) . set_env ( " LLAMA_ARG_DEVICE " ) ) ;
// --list-devices: print one line per backend device of type GPU (name,
// description, total and free memory scaled down by 1024*1024 and labelled MiB),
// then terminate the process with exit status 0. Devices of any other type are
// not listed.
add_opt(common_arg(
    {" --list-devices "},
    " print list of available devices and exit ",
    [](common_params &) {
        printf(" Available devices: \n ");
        // the device count is loop-invariant, so query it once
        const size_t n_devices = ggml_backend_dev_count();
        for (size_t idx = 0; idx < n_devices; ++idx) {
            auto * dev = ggml_backend_dev_get(idx);
            if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
                continue;
            }
            // zero-initialised so the report stays well-defined even if the query
            // leaves an out-parameter untouched; the names avoid shadowing free()
            size_t free_mem  = 0;
            size_t total_mem = 0;
            ggml_backend_dev_memory(dev, &free_mem, &total_mem);
            printf(" %s: %s (%zu MiB, %zu MiB free) \n ", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total_mem / 1024 / 1024, free_mem / 1024 / 1024);
        }
        exit(0);
    }
));
2024-10-10 22:57:42 +02:00
// -ngl / --gpu-layers / --n-gpu-layers N: store N in params.n_gpu_layers.
// The value is always recorded; when llama_supports_gpu_offload() is false a
// three-line warning is written to stderr.
add_opt(common_arg(
    {" -ngl ", " --gpu-layers ", " --n-gpu-layers "}, " N ",
    " number of layers to store in VRAM ",
    [](common_params & params, int n_layers) {
        params.n_gpu_layers = n_layers;
        if (llama_supports_gpu_offload()) {
            return;
        }
        fprintf(stderr, " warning: no usable GPU found, --gpu-layers option will be ignored \n ");
        fprintf(stderr, " warning: one possible reason is that llama.cpp was compiled without GPU support \n ");
        fprintf(stderr, " warning: consult docs/build.md for compilation instructions \n ");
    }
).set_env(" LLAMA_ARG_N_GPU_LAYERS "));
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -sm " , " --split-mode " } , " {none,layer,row} " ,
" how to split the model across multiple GPUs, one of: \n "
" - none: use one GPU only \n "
" - layer (default): split layers and KV across GPUs \n "
" - row: split rows across GPUs " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
std : : string arg_next = value ;
if ( arg_next = = " none " ) {
params . split_mode = LLAMA_SPLIT_MODE_NONE ;
} else if ( arg_next = = " layer " ) {
params . split_mode = LLAMA_SPLIT_MODE_LAYER ;
2024-09-10 22:41:29 +02:00
} else if ( arg_next = = " row " ) {
2024-09-09 23:36:09 +02:00
params . split_mode = LLAMA_SPLIT_MODE_ROW ;
2024-09-10 22:41:29 +02:00
} else {
2024-09-09 23:36:09 +02:00
throw std : : invalid_argument ( " invalid value " ) ;
}
2024-09-10 22:41:29 +02:00
if ( ! llama_supports_gpu_offload ( ) ) {
fprintf ( stderr , " warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect. \n " ) ;
}
2024-09-09 23:36:09 +02:00
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_SPLIT_MODE " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -ts " , " --tensor-split " } , " N0,N1,N2,... " ,
" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
std : : string arg_next = value ;
// split string by , and /
const std : : regex regex { R " ([,/]+) " } ;
std : : sregex_token_iterator it { arg_next . begin ( ) , arg_next . end ( ) , regex , - 1 } ;
std : : vector < std : : string > split_arg { it , { } } ;
if ( split_arg . size ( ) > = llama_max_devices ( ) ) {
throw std : : invalid_argument (
2024-10-12 07:21:51 +02:00
string_format ( " got %d input configs, but system only has %d devices " , ( int ) split_arg . size ( ) , ( int ) llama_max_devices ( ) )
2024-09-09 23:36:09 +02:00
) ;
}
for ( size_t i = 0 ; i < llama_max_devices ( ) ; + + i ) {
if ( i < split_arg . size ( ) ) {
2024-09-10 22:41:29 +02:00
params . tensor_split [ i ] = std : : stof ( split_arg [ i ] ) ;
2024-09-09 23:36:09 +02:00
} else {
2024-09-10 22:41:29 +02:00
params . tensor_split [ i ] = 0.0f ;
2024-09-09 23:36:09 +02:00
}
}
2024-09-10 22:41:29 +02:00
if ( ! llama_supports_gpu_offload ( ) ) {
fprintf ( stderr , " warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect. \n " ) ;
}
2024-09-09 23:36:09 +02:00
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_TENSOR_SPLIT " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -mg " , " --main-gpu " } , " INDEX " ,
2024-10-12 07:21:51 +02:00
string_format ( " the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d) " , params . main_gpu ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . main_gpu = value ;
2024-09-10 22:41:29 +02:00
if ( ! llama_supports_gpu_offload ( ) ) {
fprintf ( stderr , " warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect. \n " ) ;
}
2024-09-09 23:36:09 +02:00
}
2024-09-25 14:05:13 +02:00
) . set_env ( " LLAMA_ARG_MAIN_GPU " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --check-tensors " } ,
2024-10-12 07:21:51 +02:00
string_format ( " check model tensor data for invalid values (default: %s) " , params . check_tensors ? " true " : " false " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . check_tensors = true ;
}
) ) ;
2024-10-10 22:57:42 +02:00
// --override-kv KEY=TYPE:VALUE: hand the raw text to string_parse_kv_override(),
// which receives params.kv_overrides as its second argument. A false return is
// turned into a std::runtime_error that echoes the offending text.
add_opt(common_arg(
    {" --override-kv "}, " KEY=TYPE:VALUE ",
    " advanced option to override model metadata by key. may be specified multiple times. \n "
    " types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false ",
    [](common_params & params, const std::string & value) {
        const bool parsed = string_parse_kv_override(value.c_str(), params.kv_overrides);
        if (parsed) {
            return;
        }
        throw std::runtime_error(string_format(" error: Invalid type for KV override: %s \n ", value.c_str()));
    }
));
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --lora " } , " FNAME " ,
" path to LoRA adapter (can be repeated to use multiple adapters) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2025-01-03 09:18:53 +01:00
params . lora_adapters . push_back ( { std : : string ( value ) , 1.0 , nullptr } ) ;
2024-09-09 23:36:09 +02:00
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
) . set_examples ( { LLAMA_EXAMPLE_COMMON , LLAMA_EXAMPLE_EXPORT_LORA } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --lora-scaled " } , " FNAME " , " SCALE " ,
" path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & fname , const std : : string & scale ) {
2025-01-03 09:18:53 +01:00
params . lora_adapters . push_back ( { fname , std : : stof ( scale ) , nullptr } ) ;
2024-09-09 23:36:09 +02:00
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
) . set_examples ( { LLAMA_EXAMPLE_COMMON , LLAMA_EXAMPLE_EXPORT_LORA } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --control-vector " } , " FNAME " ,
" add a control vector \n note: this argument can be repeated to add multiple control vectors " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . control_vectors . push_back ( { 1.0f , value , } ) ;
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --control-vector-scaled " } , " FNAME " , " SCALE " ,
" add a control vector with user defined scaling SCALE \n "
" note: this argument can be repeated to add multiple scaled control vectors " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & fname , const std : : string & scale ) {
2024-09-09 23:36:09 +02:00
params . control_vectors . push_back ( { std : : stof ( scale ) , fname } ) ;
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --control-vector-layer-range " } , " START " , " END " ,
" layer range to apply the control vector(s) to, start and end inclusive " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & start , const std : : string & end ) {
2024-09-09 23:36:09 +02:00
params . control_vector_layer_start = std : : stoi ( start ) ;
params . control_vector_layer_end = std : : stoi ( end ) ;
}
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -a " , " --alias " } , " STRING " ,
" set alias for model name (to be used by REST API) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . model_alias = value ;
}
2024-09-25 14:05:13 +02:00
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_ALIAS " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -m " , " --model " } , " FNAME " ,
ex = = LLAMA_EXAMPLE_EXPORT_LORA
? std : : string ( " model path from which to load base model " )
2024-10-12 07:21:51 +02:00
: string_format (
2024-09-09 23:36:09 +02:00
" model path (default: `models/$filename` with filename from `--hf-file` "
" or `--model-url` if set, otherwise %s) " , DEFAULT_MODEL_PATH
) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . model = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_COMMON , LLAMA_EXAMPLE_EXPORT_LORA } ) . set_env ( " LLAMA_ARG_MODEL " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -mu " , " --model-url " } , " MODEL_URL " ,
" model download url (default: unused) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . model_url = value ;
}
) . set_env ( " LLAMA_ARG_MODEL_URL " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2025-01-13 13:56:23 +01:00
{ " -hf " , " -hfr " , " --hf-repo " } , " <user>/<model>[:quant] " ,
" Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist. \n "
" example: unsloth/phi-4-GGUF:q4_k_m \n "
" (default: unused) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . hf_repo = value ;
}
) . set_env ( " LLAMA_ARG_HF_REPO " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -hff " , " --hf-file " } , " FILE " ,
2025-01-13 13:56:23 +01:00
" Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . hf_file = value ;
}
) . set_env ( " LLAMA_ARG_HF_FILE " ) ) ;
2024-12-18 18:27:21 +01:00
add_opt ( common_arg (
2025-01-13 13:56:23 +01:00
{ " -hfv " , " -hfrv " , " --hf-repo-v " } , " <user>/<model>[:quant] " ,
2024-12-18 18:27:21 +01:00
" Hugging Face model repository for the vocoder model (default: unused) " ,
[ ] ( common_params & params , const std : : string & value ) {
params . vocoder . hf_repo = value ;
}
) . set_env ( " LLAMA_ARG_HF_REPO_V " ) ) ;
add_opt ( common_arg (
{ " -hffv " , " --hf-file-v " } , " FILE " ,
" Hugging Face model file for the vocoder model (default: unused) " ,
[ ] ( common_params & params , const std : : string & value ) {
params . vocoder . hf_file = value ;
}
) . set_env ( " LLAMA_ARG_HF_FILE_V " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -hft " , " --hf-token " } , " TOKEN " ,
" Hugging Face access token (default: value from HF_TOKEN environment variable) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . hf_token = value ;
}
) . set_env ( " HF_TOKEN " ) ) ;
2024-10-10 22:57:42 +02:00
// --context-file FNAME: append the path to params.context_files.
// The file is opened in binary mode purely as an up-front check; the stream is
// discarded immediately and only the path is stored. A path that cannot be
// opened throws std::runtime_error.
add_opt(common_arg(
    {" --context-file "}, " FNAME ",
    " file to load context from (repeat to specify multiple files) ",
    [](common_params & params, const std::string & value) {
        const bool open_failed = std::ifstream(value, std::ios::binary).fail();
        if (open_failed) {
            throw std::runtime_error(string_format(" error: failed to open file '%s' \n ", value.c_str()));
        }
        params.context_files.push_back(value);
    }
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --chunk-size " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " minimum length of embedded text chunks (default: %d) " , params . chunk_size ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . chunk_size = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_RETRIEVAL } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --chunk-separator " } , " STRING " ,
2024-10-12 07:21:51 +02:00
string_format ( " separator between chunks (default: '%s') " , params . chunk_separator . c_str ( ) ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . chunk_separator = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_RETRIEVAL } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --junk " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " number of times to repeat the junk text (default: %d) " , params . n_junk ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . n_junk = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_PASSKEY } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --pos " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " position of the passkey in the junk text (default: %d) " , params . i_pos ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . i_pos = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_PASSKEY } ) ) ;
2024-10-10 22:57:42 +02:00
// -o / --output / --output-file FNAME: one flag shared by three examples.
// The default shown in the help depends on the example being configured (`ex`):
// export-lora shows lora_outfile, cvector-generator shows cvector_outfile, and
// everything else shows out_file. The handler writes the same path into all
// three fields.
add_opt(common_arg(
    {" -o ", " --output ", " --output-file "}, " FNAME ",
    string_format(" output file (default: '%s') ",
        (ex == LLAMA_EXAMPLE_EXPORT_LORA       ? params.lora_outfile
       : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR ? params.cvector_outfile
                                               : params.out_file).c_str()),
    [](common_params & params, const std::string & path) {
        params.lora_outfile    = path;
        params.cvector_outfile = path;
        params.out_file        = path;
    }
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -ofreq " , " --output-frequency " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " output the imatrix every N iterations (default: %d) " , params . n_out_freq ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . n_out_freq = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_IMATRIX } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --save-frequency " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " save an imatrix copy every N iterations (default: %d) " , params . n_save_freq ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . n_save_freq = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_IMATRIX } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --process-output " } ,
2024-10-12 07:21:51 +02:00
string_format ( " collect data for the output tensor (default: %s) " , params . process_output ? " true " : " false " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . process_output = true ;
}
) . set_examples ( { LLAMA_EXAMPLE_IMATRIX } ) ) ;
2024-10-10 22:57:42 +02:00
// --no-ppl: clear params.compute_ppl.
// The help sentence describes the negated action ("do not compute"), so the
// displayed default is the inverse of compute_ppl. Printing compute_ppl itself
// would read as "perplexity is skipped" whenever it is actually computed.
add_opt(common_arg(
    {" --no-ppl "},
    string_format(" do not compute perplexity (default: %s) ", params.compute_ppl ? " false " : " true "),
    [](common_params & params) {
        params.compute_ppl = false;
    }
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --chunk " , " --from-chunk " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " start processing the input from chunk N (default: %d) " , params . i_chunk ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . i_chunk = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_IMATRIX } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -pps " } ,
2024-10-12 07:21:51 +02:00
string_format ( " is the prompt shared across parallel sequences (default: %s) " , params . is_pp_shared ? " true " : " false " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . is_pp_shared = true ;
}
) . set_examples ( { LLAMA_EXAMPLE_BENCH } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -npp " } , " n0,n1,... " ,
" number of prompt tokens " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
auto p = string_split < int > ( value , ' , ' ) ;
params . n_pp . insert ( params . n_pp . end ( ) , p . begin ( ) , p . end ( ) ) ;
}
) . set_examples ( { LLAMA_EXAMPLE_BENCH } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -ntg " } , " n0,n1,... " ,
" number of text generation tokens " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
auto p = string_split < int > ( value , ' , ' ) ;
params . n_tg . insert ( params . n_tg . end ( ) , p . begin ( ) , p . end ( ) ) ;
}
) . set_examples ( { LLAMA_EXAMPLE_BENCH } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -npl " } , " n0,n1,... " ,
" number of parallel prompts " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
auto p = string_split < int > ( value , ' , ' ) ;
params . n_pl . insert ( params . n_pl . end ( ) , p . begin ( ) , p . end ( ) ) ;
}
) . set_examples ( { LLAMA_EXAMPLE_BENCH } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --embd-normalize " } , " N " ,
2024-10-22 09:40:02 +02:00
string_format ( " normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) " , params . embd_normalize ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . embd_normalize = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_EMBEDDING } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --embd-output-format " } , " FORMAT " ,
" empty = default, \" array \" = [[],[]...], \" json \" = openai style, \" json+ \" = same \" json \" + cosine similarity matrix " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . embd_out = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_EMBEDDING } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --embd-separator " } , " STRING " ,
2024-10-22 09:40:02 +02:00
" separator of embeddings (default \\ n) for example \" <#sep#> \" " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . embd_sep = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_EMBEDDING } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --host " } , " HOST " ,
2024-10-12 07:21:51 +02:00
string_format ( " ip address to listen (default: %s) " , params . hostname . c_str ( ) ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . hostname = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_HOST " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --port " } , " PORT " ,
2024-10-12 07:21:51 +02:00
string_format ( " port to listen (default: %d) " , params . port ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . port = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_PORT " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --path " } , " PATH " ,
2024-10-12 07:21:51 +02:00
string_format ( " path to serve static files from (default: %s) " , params . public_path . c_str ( ) ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . public_path = value ;
}
2024-09-25 14:05:13 +02:00
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_STATIC_PATH " ) ) ;
2024-12-10 18:22:34 +01:00
add_opt ( common_arg (
{ " --no-webui " } ,
string_format ( " Disable the Web UI (default: %s) " , params . webui ? " enabled " : " disabled " ) ,
[ ] ( common_params & params ) {
params . webui = false ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_NO_WEBUI " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --embedding " , " --embeddings " } ,
2024-10-12 07:21:51 +02:00
string_format ( " restrict to only support embedding use case; use only with dedicated embedding models (default: %s) " , params . embedding ? " enabled " : " disabled " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . embedding = true ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_EMBEDDINGS " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-28 16:42:03 +02:00
{ " --reranking " , " --rerank " } ,
2024-10-12 07:21:51 +02:00
string_format ( " enable reranking endpoint on server (default: %s) " , params . reranking ? " enabled " : " disabled " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-28 16:42:03 +02:00
params . reranking = true ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_RERANKING " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --api-key " } , " KEY " ,
" API key to use for authentication (default: none) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . api_keys . push_back ( value ) ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_API_KEY " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --api-key-file " } , " FNAME " ,
" path to file containing API keys (default: none) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
std : : ifstream key_file ( value ) ;
if ( ! key_file ) {
2024-10-12 07:21:51 +02:00
throw std : : runtime_error ( string_format ( " error: failed to open file '%s' \n " , value . c_str ( ) ) ) ;
2024-09-09 23:36:09 +02:00
}
std : : string key ;
while ( std : : getline ( key_file , key ) ) {
if ( ! key . empty ( ) ) {
params . api_keys . push_back ( key ) ;
}
}
key_file . close ( ) ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --ssl-key-file " } , " FNAME " ,
" path to file a PEM-encoded SSL private key " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . ssl_file_key = value ;
}
2024-09-25 14:05:13 +02:00
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_SSL_KEY_FILE " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --ssl-cert-file " } , " FNAME " ,
" path to file a PEM-encoded SSL certificate " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . ssl_file_cert = value ;
}
2024-09-25 14:05:13 +02:00
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_SSL_CERT_FILE " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -to " , " --timeout " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " server read/write timeout in seconds (default: %d) " , params . timeout_read ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . timeout_read = value ;
params . timeout_write = value ;
}
2024-09-25 14:05:13 +02:00
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_TIMEOUT " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --threads-http " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " number of threads used to process HTTP requests (default: %d) " , params . n_threads_http ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . n_threads_http = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_THREADS_HTTP " ) ) ;
2024-10-13 17:52:48 +02:00
add_opt ( common_arg (
{ " --cache-reuse " } , " N " ,
string_format ( " min chunk size to attempt reusing from the cache via KV shifting (default: %d) " , params . n_cache_reuse ) ,
[ ] ( common_params & params , int value ) {
params . n_cache_reuse = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_CACHE_REUSE " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --metrics " } ,
2024-10-12 07:21:51 +02:00
string_format ( " enable prometheus compatible metrics endpoint (default: %s) " , params . endpoint_metrics ? " enabled " : " disabled " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . endpoint_metrics = true ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_ENDPOINT_METRICS " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-10-08 13:27:04 +02:00
{ " --slots " } ,
2024-10-12 07:21:51 +02:00
string_format ( " enable slots monitoring endpoint (default: %s) " , params . endpoint_slots ? " enabled " : " disabled " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-10-08 13:27:04 +02:00
params . endpoint_slots = true ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_ENDPOINT_SLOTS " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-10-08 13:27:04 +02:00
{ " --props " } ,
2024-10-12 07:21:51 +02:00
string_format ( " enable changing global properties via POST /props (default: %s) " , params . endpoint_props ? " enabled " : " disabled " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-10-08 13:27:04 +02:00
params . endpoint_props = true ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_ENDPOINT_PROPS " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --no-slots " } ,
2024-10-08 13:27:04 +02:00
" disables slots monitoring endpoint " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . endpoint_slots = false ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_NO_ENDPOINT_SLOTS " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --slot-save-path " } , " PATH " ,
" path to save slot kv cache (default: disabled) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . slot_save_path = value ;
// if doesn't end with DIRECTORY_SEPARATOR, add it
if ( ! params . slot_save_path . empty ( ) & & params . slot_save_path [ params . slot_save_path . size ( ) - 1 ] ! = DIRECTORY_SEPARATOR ) {
params . slot_save_path + = DIRECTORY_SEPARATOR ;
}
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --chat-template " } , " JINJA_TEMPLATE " ,
2024-12-02 22:10:19 +01:00
string_format (
" set custom jinja chat template (default: template taken from model's metadata) \n "
" if suffix/prefix are specified, template will be disabled \n "
" list of built-in templates: \n %s " , list_builtin_chat_templates ( ) . c_str ( )
) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
if ( ! common_chat_verify_template ( value ) ) {
2024-10-12 07:21:51 +02:00
throw std : : runtime_error ( string_format (
2024-09-09 23:36:09 +02:00
" error: the supplied chat template is not supported: %s \n "
" note: llama.cpp does not use jinja parser, we only support commonly used templates \n " ,
value . c_str ( )
) ) ;
}
params . chat_template = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_MAIN , LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_CHAT_TEMPLATE " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " -sps " , " --slot-prompt-similarity " } , " SIMILARITY " ,
2024-10-12 07:21:51 +02:00
string_format ( " how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled) \n " , params . slot_prompt_similarity ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . slot_prompt_similarity = std : : stof ( value ) ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --lora-init-without-apply " } ,
2024-10-12 07:21:51 +02:00
string_format ( " load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s) " , params . lora_init_without_apply ? " enabled " : " disabled " ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . lora_init_without_apply = true ;
}
) . set_examples ( { LLAMA_EXAMPLE_SERVER } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --simple-io " } ,
" use basic IO for better compatibility in subprocesses and limited consoles " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-09 23:36:09 +02:00
params . simple_io = true ;
}
) . set_examples ( { LLAMA_EXAMPLE_MAIN , LLAMA_EXAMPLE_INFILL } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --positive-file " } , " FNAME " ,
2024-10-12 07:21:51 +02:00
string_format ( " positive prompts file, one prompt per line (default: '%s') " , params . cvector_positive_file . c_str ( ) ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . cvector_positive_file = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_CVECTOR_GENERATOR } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --negative-file " } , " FNAME " ,
2024-10-12 07:21:51 +02:00
string_format ( " negative prompts file, one prompt per line (default: '%s') " , params . cvector_negative_file . c_str ( ) ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
params . cvector_negative_file = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_CVECTOR_GENERATOR } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --pca-batch " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d) " , params . n_pca_batch ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . n_pca_batch = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_CVECTOR_GENERATOR } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --pca-iter " } , " N " ,
2024-10-12 07:21:51 +02:00
string_format ( " number of iterations used for PCA (default: %d) " , params . n_pca_iterations ) ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-09 23:36:09 +02:00
params . n_pca_iterations = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_CVECTOR_GENERATOR } ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --method " } , " {pca, mean} " ,
" dimensionality reduction method to be used (default: pca) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , const std : : string & value ) {
2024-09-09 23:36:09 +02:00
/**/ if ( value = = " pca " ) { params . cvector_dimre_method = DIMRE_METHOD_PCA ; }
else if ( value = = " mean " ) { params . cvector_dimre_method = DIMRE_METHOD_MEAN ; }
else { throw std : : invalid_argument ( " invalid value " ) ; }
}
) . set_examples ( { LLAMA_EXAMPLE_CVECTOR_GENERATOR } ) ) ;
2024-10-10 22:57:42 +02:00
// --output-format {md,jsonl}: selects how batched-bench results are printed.
//   "jsonl" sets params.batched_bench_output_jsonl to true,
//   "md"    sets it to false.
// Any other value raises std::invalid_argument. The exception has to be
// thrown, not merely constructed, otherwise an unknown format is accepted
// without any diagnostic.
add_opt(common_arg(
    {"--output-format"}, "{md,jsonl}",
    "output format for batched-bench results (default: md)",
    [](common_params & params, const std::string & value) {
        /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
        else if (value == "md")    { params.batched_bench_output_jsonl = false; }
        else { throw std::invalid_argument("invalid value"); }
    }
).set_examples({LLAMA_EXAMPLE_BENCH}));
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-09 23:36:09 +02:00
{ " --log-disable " } ,
" Log disable " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & ) {
common_log_pause ( common_log_main ( ) ) ;
2024-09-15 19:46:12 +02:00
}
2024-09-09 23:36:09 +02:00
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-15 19:46:12 +02:00
{ " --log-file " } , " FNAME " ,
" Log to file " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & , const std : : string & value ) {
common_log_set_file ( common_log_main ( ) , value . c_str ( ) ) ;
2024-09-15 19:46:12 +02:00
}
2024-09-09 23:36:09 +02:00
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-15 19:46:12 +02:00
{ " --log-colors " } ,
" Enable colored logging " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & ) {
common_log_set_colors ( common_log_main ( ) , true ) ;
2024-09-15 19:46:12 +02:00
}
) . set_env ( " LLAMA_LOG_COLORS " ) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-15 19:46:12 +02:00
{ " -v " , " --verbose " , " --log-verbose " } ,
" Set verbosity level to infinity (i.e. log all messages, useful for debugging) " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params ) {
2024-09-15 19:46:12 +02:00
params . verbosity = INT_MAX ;
2024-10-10 22:57:42 +02:00
common_log_set_verbosity_thold ( INT_MAX ) ;
2024-09-15 19:46:12 +02:00
}
2024-09-09 23:36:09 +02:00
) ) ;
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-15 19:46:12 +02:00
{ " -lv " , " --verbosity " , " --log-verbosity " } , " N " ,
" Set the verbosity threshold. Messages with a higher verbosity will be ignored. " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & params , int value ) {
2024-09-15 19:46:12 +02:00
params . verbosity = value ;
2024-10-10 22:57:42 +02:00
common_log_set_verbosity_thold ( value ) ;
2024-09-15 19:46:12 +02:00
}
) . set_env ( " LLAMA_LOG_VERBOSITY " ) ) ;
2024-10-10 22:57:42 +02:00
// --log-prefix: calls common_log_set_prefix(..., true) on the main log.
// The option takes no value and ignores common_params; it can also be
// supplied through the LLAMA_LOG_PREFIX environment variable.
add_opt(common_arg(
    {"--log-prefix"},
    "Enable prefix in log messages",
    [](common_params &) {
        common_log_set_prefix(common_log_main(), true);
    }
).set_env("LLAMA_LOG_PREFIX"));
2024-10-10 22:57:42 +02:00
add_opt ( common_arg (
2024-09-15 19:46:12 +02:00
{ " --log-timestamps " } ,
" Enable timestamps in log messages " ,
2024-10-10 22:57:42 +02:00
[ ] ( common_params & ) {
common_log_set_timestamps ( common_log_main ( ) , true ) ;
2024-09-15 19:46:12 +02:00
}
) . set_env ( " LLAMA_LOG_TIMESTAMPS " ) ) ;
2024-09-09 23:36:09 +02:00
2024-11-25 08:58:41 +01:00
// speculative parameters
add_opt ( common_arg (
{ " -td " , " --threads-draft " } , " N " ,
" number of threads to use during generation (default: same as --threads) " ,
[ ] ( common_params & params , int value ) {
params . speculative . cpuparams . n_threads = value ;
if ( params . speculative . cpuparams . n_threads < = 0 ) {
params . speculative . cpuparams . n_threads = std : : thread : : hardware_concurrency ( ) ;
}
}
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE } ) ) ;
add_opt ( common_arg (
{ " -tbd " , " --threads-batch-draft " } , " N " ,
" number of threads to use during batch and prompt processing (default: same as --threads-draft) " ,
[ ] ( common_params & params , int value ) {
params . speculative . cpuparams_batch . n_threads = value ;
if ( params . speculative . cpuparams_batch . n_threads < = 0 ) {
params . speculative . cpuparams_batch . n_threads = std : : thread : : hardware_concurrency ( ) ;
}
}
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE } ) ) ;
add_opt ( common_arg (
{ " -Cd " , " --cpu-mask-draft " } , " M " ,
" Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask) " ,
[ ] ( common_params & params , const std : : string & mask ) {
params . speculative . cpuparams . mask_valid = true ;
if ( ! parse_cpu_mask ( mask , params . speculative . cpuparams . cpumask ) ) {
throw std : : invalid_argument ( " invalid cpumask " ) ;
}
}
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE } ) ) ;
add_opt ( common_arg (
{ " -Crd " , " --cpu-range-draft " } , " lo-hi " ,
" Ranges of CPUs for affinity. Complements --cpu-mask-draft " ,
[ ] ( common_params & params , const std : : string & range ) {
params . speculative . cpuparams . mask_valid = true ;
if ( ! parse_cpu_range ( range , params . speculative . cpuparams . cpumask ) ) {
throw std : : invalid_argument ( " invalid range " ) ;
}
}
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE } ) ) ;
add_opt ( common_arg (
{ " --cpu-strict-draft " } , " <0|1> " ,
" Use strict CPU placement for draft model (default: same as --cpu-strict) " ,
[ ] ( common_params & params , int value ) {
params . speculative . cpuparams . strict_cpu = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE } ) ) ;
add_opt ( common_arg (
{ " --prio-draft " } , " N " ,
string_format ( " set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d) \n " , params . speculative . cpuparams . priority ) ,
[ ] ( common_params & params , int prio ) {
if ( prio < 0 | | prio > 3 ) {
throw std : : invalid_argument ( " invalid value " ) ;
}
params . speculative . cpuparams . priority = ( enum ggml_sched_priority ) prio ;
}
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE } ) ) ;
// --poll-draft <0|1>: stores the given integer in
// params.speculative.cpuparams.poll (the draft model's CPU parameters).
add_opt(common_arg(
    {"--poll-draft"}, "<0|1>",
    "Use polling to wait for draft model work (default: same as --poll)",
    [](common_params & params, int value) {
        params.speculative.cpuparams.poll = value;
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
// -Cbd / --cpu-mask-batch-draft M: parses a CPU affinity mask into
// params.speculative.cpuparams_batch.cpumask. mask_valid is set before
// parsing; a mask that parse_cpu_mask rejects raises std::invalid_argument.
// The companion range option for this mask is --cpu-range-batch-draft.
// NOTE(review): the "(default: same as --cpu-mask)" wording is kept as-is;
// sibling batch-draft options advertise a fallback to their -draft
// counterpart instead - confirm which default actually applies.
add_opt(common_arg(
    {"-Cbd", "--cpu-mask-batch-draft"}, "M",
    "Draft model CPU affinity mask. Complements cpu-range-batch-draft (default: same as --cpu-mask)",
    [](common_params & params, const std::string & mask) {
        params.speculative.cpuparams_batch.mask_valid = true;
        if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) {
            throw std::invalid_argument("invalid cpumask");
        }
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
// -Crbd / --cpu-range-batch-draft lo-hi: parses a CPU range into
// params.speculative.cpuparams_batch.cpumask. mask_valid is set before
// parsing; a range that parse_cpu_range rejects raises
// std::invalid_argument("invalid range"), matching --cpu-range-draft.
add_opt(common_arg(
    {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
    "Ranges of CPUs for affinity. Complements --cpu-mask-batch-draft",
    [](common_params & params, const std::string & range) {
        params.speculative.cpuparams_batch.mask_valid = true;
        if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) {
            throw std::invalid_argument("invalid range");
        }
    }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
add_opt ( common_arg (
{ " --cpu-strict-batch-draft " } , " <0|1> " ,
" Use strict CPU placement for draft model (default: --cpu-strict-draft) " ,
[ ] ( common_params & params , int value ) {
params . speculative . cpuparams_batch . strict_cpu = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE } ) ) ;
add_opt ( common_arg (
{ " --prio-batch-draft " } , " N " ,
string_format ( " set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d) \n " , params . speculative . cpuparams_batch . priority ) ,
[ ] ( common_params & params , int prio ) {
if ( prio < 0 | | prio > 3 ) {
throw std : : invalid_argument ( " invalid value " ) ;
}
params . speculative . cpuparams_batch . priority = ( enum ggml_sched_priority ) prio ;
}
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE } ) ) ;
add_opt ( common_arg (
{ " --poll-batch-draft " } , " <0|1> " ,
" Use polling to wait for draft model work (default: --poll-draft) " ,
[ ] ( common_params & params , int value ) {
params . speculative . cpuparams_batch . poll = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE } ) ) ;
add_opt ( common_arg (
{ " --draft-max " , " --draft " , " --draft-n " } , " N " ,
string_format ( " number of tokens to draft for speculative decoding (default: %d) " , params . speculative . n_max ) ,
[ ] ( common_params & params , int value ) {
params . speculative . n_max = value ;
}
2024-12-12 16:57:32 +01:00
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE , LLAMA_EXAMPLE_LOOKUP , LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_DRAFT_MAX " ) ) ;
2024-11-25 08:58:41 +01:00
add_opt ( common_arg (
{ " --draft-min " , " --draft-n-min " } , " N " ,
string_format ( " minimum number of draft tokens to use for speculative decoding (default: %d) " , params . speculative . n_min ) ,
[ ] ( common_params & params , int value ) {
params . speculative . n_min = value ;
}
2024-12-12 16:57:32 +01:00
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE , LLAMA_EXAMPLE_LOOKUP , LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_DRAFT_MIN " ) ) ;
2024-11-25 08:58:41 +01:00
add_opt ( common_arg (
{ " --draft-p-split " } , " P " ,
string_format ( " speculative decoding split probability (default: %.1f) " , ( double ) params . speculative . p_split ) ,
[ ] ( common_params & params , const std : : string & value ) {
params . speculative . p_split = std : : stof ( value ) ;
}
2024-12-12 16:57:32 +01:00
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE } ) . set_env ( " LLAMA_ARG_DRAFT_P_SPLIT " ) ) ;
2024-11-25 08:58:41 +01:00
add_opt ( common_arg (
{ " --draft-p-min " } , " P " ,
string_format ( " minimum speculative decoding probability (greedy) (default: %.1f) " , ( double ) params . speculative . p_min ) ,
[ ] ( common_params & params , const std : : string & value ) {
params . speculative . p_min = std : : stof ( value ) ;
}
2024-12-12 16:57:32 +01:00
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE , LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_DRAFT_P_MIN " ) ) ;
2024-11-25 08:58:41 +01:00
add_opt ( common_arg (
{ " -cd " , " --ctx-size-draft " } , " N " ,
string_format ( " size of the prompt context for the draft model (default: %d, 0 = loaded from model) " , params . speculative . n_ctx ) ,
[ ] ( common_params & params , int value ) {
params . speculative . n_ctx = value ;
}
2024-12-12 16:57:32 +01:00
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE , LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_CTX_SIZE_DRAFT " ) ) ;
2024-11-25 19:30:06 +01:00
add_opt ( common_arg (
{ " -devd " , " --device-draft " } , " <dev1,dev2,..> " ,
" comma-separated list of devices to use for offloading the draft model (none = don't offload) \n "
" use --list-devices to see a list of available devices " ,
[ ] ( common_params & params , const std : : string & value ) {
params . speculative . devices = parse_device_list ( value ) ;
}
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE , LLAMA_EXAMPLE_SERVER } ) ) ;
2024-11-25 08:58:41 +01:00
add_opt ( common_arg (
{ " -ngld " , " --gpu-layers-draft " , " --n-gpu-layers-draft " } , " N " ,
" number of layers to store in VRAM for the draft model " ,
[ ] ( common_params & params , int value ) {
params . speculative . n_gpu_layers = value ;
if ( ! llama_supports_gpu_offload ( ) ) {
2024-11-28 18:15:25 +01:00
fprintf ( stderr , " warning: no usable GPU found, --gpu-layers-draft option will be ignored \n " ) ;
fprintf ( stderr , " warning: one possible reason is that llama.cpp was compiled without GPU support \n " ) ;
fprintf ( stderr , " warning: consult docs/build.md for compilation instructions \n " ) ;
2024-11-25 08:58:41 +01:00
}
}
2024-12-12 16:57:32 +01:00
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE , LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_N_GPU_LAYERS_DRAFT " ) ) ;
2024-11-25 08:58:41 +01:00
add_opt ( common_arg (
{ " -md " , " --model-draft " } , " FNAME " ,
" draft model for speculative decoding (default: unused) " ,
[ ] ( common_params & params , const std : : string & value ) {
params . speculative . model = value ;
}
2024-12-12 16:57:32 +01:00
) . set_examples ( { LLAMA_EXAMPLE_SPECULATIVE , LLAMA_EXAMPLE_SERVER } ) . set_env ( " LLAMA_ARG_MODEL_DRAFT " ) ) ;
2024-11-25 08:58:41 +01:00
2024-12-18 18:27:21 +01:00
add_opt ( common_arg (
{ " -mv " , " --model-vocoder " } , " FNAME " ,
" vocoder model for audio generation (default: unused) " ,
[ ] ( common_params & params , const std : : string & value ) {
params . vocoder . model = value ;
}
) . set_examples ( { LLAMA_EXAMPLE_TTS , LLAMA_EXAMPLE_SERVER } ) ) ;
2024-12-19 16:35:15 +01:00
// model-specific
add_opt ( common_arg (
{ " --tts-oute-default " } ,
string_format ( " use default OuteTTS models (note: can download weights from the internet) " ) ,
[ ] ( common_params & params ) {
params . hf_repo = " OuteAI/OuteTTS-0.2-500M-GGUF " ;
params . hf_file = " OuteTTS-0.2-500M-Q8_0.gguf " ;
params . vocoder . hf_repo = " ggml-org/WavTokenizer " ;
params . vocoder . hf_file = " WavTokenizer-Large-75-F16.gguf " ;
}
) . set_examples ( { LLAMA_EXAMPLE_TTS } ) ) ;
2024-09-09 23:36:09 +02:00
return ctx_arg ;
}