diff --git a/examples/common.cpp b/examples/common.cpp
index 259880a7c..a6abc4977 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -321,12 +321,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-        } else if (arg == "--n-parts") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_parts = std::stoi(argv[i]);
         } else if (arg == "-h" || arg == "--help") {
             gpt_print_usage(argc, argv, default_params);
             exit(0);
@@ -418,7 +412,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
     fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value\n");
     fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
-    fprintf(stderr, "  --n-parts N           number of model parts (default: -1 = determine from dimensions)\n");
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
     fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
@@ -473,7 +466,6 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx        = params.n_ctx;
-    lparams.n_parts      = params.n_parts;
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.seed         = params.seed;
     lparams.f16_kv       = params.memory_f16;
diff --git a/examples/common.h b/examples/common.h
index f4e07a252..2ad20ba50 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -24,7 +24,6 @@ struct gpt_params {
     int32_t seed          = -1;  // RNG seed
     int32_t n_threads     = get_num_physical_cores();
     int32_t n_predict     = -1;  // new tokens to predict
-    int32_t n_parts       = -1;  // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx         = 512; // context size
     int32_t n_batch       = 512; // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 9a2aa7c64..085fdde3c 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -321,7 +321,6 @@ int main(int argc, char ** argv) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx     = 256;
-    lparams.n_parts   = 1;
     lparams.seed      = 1;
     lparams.f16_kv    = false;
     lparams.use_mlock = false;
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 355969579..91f04b6c7 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -26,7 +26,6 @@ int main(int argc, char ** argv) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx    = params.n_ctx;
-    lparams.n_parts  = params.n_parts;
     lparams.seed     = params.seed;
     lparams.f16_kv   = params.memory_f16;
     lparams.use_mmap = params.use_mmap;
diff --git a/llama.cpp b/llama.cpp
index 98f49abd7..6e19064fc 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -812,7 +812,6 @@ static bool kv_cache_init(
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx      =*/ 512,
-        /*.n_parts    =*/ -1,
         /*.gpu_layers =*/ 0,
         /*.seed       =*/ -1,
         /*.f16_kv     =*/ false,
diff --git a/llama.h b/llama.h
index 21cba8cf6..f955fa23d 100644
--- a/llama.h
+++ b/llama.h
@@ -55,7 +55,6 @@ extern "C" {
     struct llama_context_params {
         int n_ctx;        // text context
-        int n_parts;      // -1 for default
         int n_gpu_layers; // number of layers to store in VRAM
         int seed;         // RNG seed, -1 for random
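
Note: with this change, callers simply stop setting n_parts. A minimal sketch of context initialization against the updated llama_context_params (assuming the llama_init_from_file / llama_free entry points from llama.h of this era; the model path is a placeholder):

    #include "llama.h"

    int main() {
        // default params no longer carry an n_parts field
        llama_context_params lparams = llama_context_default_params();
        lparams.n_ctx        = 512;  // text context
        lparams.n_gpu_layers = 0;    // number of layers to store in VRAM
        lparams.seed         = -1;   // RNG seed, -1 for random

        llama_context * ctx = llama_init_from_file("models/7B/ggml-model.bin", lparams);
        if (ctx == NULL) {
            return 1;
        }

        llama_free(ctx);
        return 0;
    }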