From 606873401c9bb8ca49a8a5f34d414807deacba46 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 18 Feb 2024 20:59:26 +0100 Subject: [PATCH] rename n_ctx to kv_size --- README.md | 4 +- common/common.cpp | 18 +- common/common.h | 2 +- examples/Miku.sh | 4 +- examples/alpaca.sh | 2 +- examples/baby-llama/baby-llama.cpp | 42 ++--- examples/batched-bench/batched-bench.cpp | 2 +- examples/batched.swift/Sources/main.swift | 10 +- examples/batched/README.md | 2 +- examples/batched/batched.cpp | 12 +- examples/beam-search/beam-search.cpp | 4 +- examples/benchmark/benchmark-matmult.cpp | 22 +-- examples/chat-13B.bat | 2 +- examples/chat-13B.sh | 4 +- examples/chat-persistent.sh | 14 +- examples/chat-vicuna.sh | 4 +- .../convert-llama2c-to-ggml.cpp | 8 +- examples/embedding/embedding.cpp | 8 +- examples/finetune/finetune.cpp | 24 +-- examples/gpt4all.sh | 2 +- examples/imatrix/imatrix.cpp | 36 ++-- examples/infill/README.md | 3 +- examples/infill/infill.cpp | 30 +-- examples/llama-bench/llama-bench.cpp | 2 +- .../llama.cpp.swift/LibLlama.swift | 12 +- examples/llama2-13b.sh | 2 +- examples/llama2.sh | 2 +- examples/llava/llava-cli.cpp | 2 +- examples/llava/llava.cpp | 8 +- examples/lookahead/lookahead.cpp | 6 +- examples/lookup/lookup.cpp | 6 +- examples/main/README.md | 17 +- examples/main/main.cpp | 38 ++-- examples/parallel/parallel.cpp | 4 +- examples/passkey/passkey.cpp | 18 +- examples/perplexity/perplexity.cpp | 120 ++++++------ examples/quantize-stats/quantize-stats.cpp | 2 +- examples/server-llama2-13B.sh | 2 +- examples/server/README.md | 8 +- examples/server/server.cpp | 48 ++--- examples/simple/README.md | 2 +- examples/simple/simple.cpp | 12 +- examples/speculative/speculative.cpp | 6 +- .../train-text-from-scratch.cpp | 24 +-- llama.cpp | 176 +++++++++--------- llama.h | 4 +- scripts/run-with-preset.py | 4 +- tests/test-backend-ops.cpp | 12 +- 48 files changed, 403 insertions(+), 393 deletions(-) diff --git a/README.md b/README.md index 
8c7bc2689..c2de7d77d 100644 --- a/README.md +++ b/README.md @@ -186,7 +186,7 @@ llm_load_print_meta: vocab type = SPM llm_load_print_meta: n_vocab = 32000 llm_load_print_meta: n_merges = 0 llm_load_print_meta: n_ctx_train = 4096 -llm_load_print_meta: n_ctx = 512 +llm_load_print_meta: kv_size = 512 llm_load_print_meta: n_embd = 5120 llm_load_print_meta: n_head = 40 llm_load_print_meta: n_head_kv = 40 @@ -214,7 +214,7 @@ llama_new_context_with_model: compute buffer total size = 75.41 MB system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000 -generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0 +generate: kv_size = 512, n_batch = 512, n_predict = 400, n_keep = 0 Building a website can be done in 10 simple steps: diff --git a/common/common.cpp b/common/common.cpp index 10ef11829..cf2ef5ade 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -258,11 +258,19 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } sparams.top_k = std::stoi(argv[i]); } else if (arg == "-c" || arg == "--ctx-size") { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.kv_size = std::stoi(argv[i]); + fprintf(stderr, "warning: -c,--ctx-size option is deprecated, use --kv-size instead\n"); + } else if (arg == "-kv" || arg == "--kv-size" || arg == "--kv_size") { if (++i >= argc) { invalid_param = true; break; } - params.n_ctx = std::stoi(argv[i]); + params.kv_size = std::stoi(argv[i]); } else if (arg == "--grp-attn-n" || arg == "-gan") { if (++i >= argc) { invalid_param = true; @@ -962,7 +970,7 @@ void gpt_print_usage(int 
/*argc*/, char ** argv, const gpt_params & params) { printf(" -bf FNAME, --binary-file FNAME\n"); printf(" binary file containing multiple choice tasks.\n"); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); - printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); + printf(" -kv N, --kv-size N Specify the total size of the KV cache (default: %d)\n", params.kv_size); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n"); printf(" (default: %s)\n", sampler_type_names.c_str()); @@ -972,7 +980,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z); printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p); - printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n); + printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = kv_size)\n", sparams.penalty_last_n); printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat); printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present); printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq); @@ -1269,7 +1277,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) { struct llama_context_params 
llama_context_params_from_gpt_params(const gpt_params & params) { auto cparams = llama_context_default_params(); - cparams.n_ctx = params.n_ctx; + cparams.kv_size = params.kv_size; cparams.n_batch = params.n_batch; cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; @@ -1658,7 +1666,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale); fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks); fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); - fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); + fprintf(stream, "kv_size: %d # default: 512\n", params.kv_size); fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq); diff --git a/common/common.h b/common/common.h index 935771d44..b0f81ac3c 100644 --- a/common/common.h +++ b/common/common.h @@ -50,7 +50,7 @@ struct gpt_params { int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_threads_batch_draft = -1; int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 512; // context size + int32_t kv_size = 512; // KV Cache size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_draft = 8; // number of tokens to draft during speculative decoding diff --git a/examples/Miku.sh b/examples/Miku.sh index b9174b4e6..c8260ff36 100755 --- a/examples/Miku.sh +++ b/examples/Miku.sh @@ -7,11 +7,11 @@ USER_NAME="${USER_NAME:-Anon}" # Uncomment and adjust to the number of CPU cores you want to use. 
#N_THREAD="${N_THREAD:-4}" -CTX_SIZE="${CTX_SIZE:-4096}" +KV_SIZE="${KV_SIZE:-4096}" N_PREDICTS="${N_PREDICTS:-4096}" GEN_OPTIONS=(--batch_size 1024 ---ctx_size "$CTX_SIZE" +--kv_size "$KV_SIZE" --keep -1 --repeat_last_n 256 --repeat_penalty 1.17647 diff --git a/examples/alpaca.sh b/examples/alpaca.sh index 8d2bae691..6548c6c49 100755 --- a/examples/alpaca.sh +++ b/examples/alpaca.sh @@ -10,7 +10,7 @@ cd .. ./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \ --color \ -f ./prompts/alpaca.txt \ - --ctx_size 2048 \ + --kv_size 2048 \ -n -1 \ -ins -b 256 \ --top_k 10000 \ diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index e7d2ad592..28b6f898d 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -532,16 +532,16 @@ static struct ggml_tensor * forward( // Vcur shape [n_embd, N, 1, 1] struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N))); - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] - // kv_self.v shape [n_embd * n_ctx * n_layer, 1] + // kv_self.k shape [n_embd * kv_size * n_layer, 1] + // kv_self.v shape [n_embd * kv_size * n_layer, 1] // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] /* { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past)); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + ( kv_size)*ggml_element_size(kv_self.v), + (il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); // important: storing RoPE-ed version of K in the KV 
cache! ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -560,7 +560,7 @@ static struct ggml_tensor * forward( Qcur, 0, 2, 1, 3); - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // kv_self.k shape [n_embd * kv_size * n_layer, 1] // K shape [n_embd/n_head, n_past + N, n_head, 1] struct ggml_tensor * K = ggml_permute(ctx0, @@ -780,16 +780,16 @@ static struct ggml_tensor * forward_batch( assert_shape_3d(Vcur, N, n_embd, n_batch); - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] + // kv_self.k shape [n_embd * kv_size * n_batch * n_layer] + // kv_self.v shape [kv_size * n_embd * n_batch * n_layer] // k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il] // v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il] /* { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past)); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + ( kv_size)*ggml_element_size(kv_self.v), + (il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); // important: storing RoPE-ed version of K in the KV cache! 
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -817,7 +817,7 @@ static struct ggml_tensor * forward_batch( 0, 2, 1, 3); assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // kv_self.k shape [n_embd * kv_size * n_batch * n_layer] // K shape [n_embd/n_head, n_past + N, n_head, n_batch] struct ggml_tensor * K = ggml_permute(ctx0, @@ -855,7 +855,7 @@ static struct ggml_tensor * forward_batch( assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch); // split cached V into n_head heads - // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] + // kv_self.v shape [kv_size * n_embd * n_batch * n_layer] // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il] struct ggml_tensor * V = ggml_view_4d(ctx0, vc, @@ -1082,16 +1082,16 @@ static struct ggml_tensor * forward_lora( cur)), n_embd, N))); - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] - // kv_self.v shape [n_embd * n_ctx * n_layer, 1] + // kv_self.k shape [n_embd * kv_size * n_layer, 1] + // kv_self.v shape [n_embd * kv_size * n_layer, 1] // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] /* { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past)); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + ( kv_size)*ggml_element_size(kv_self.v), + (il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); // important: storing RoPE-ed version of K in the KV cache! 
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -1110,7 +1110,7 @@ static struct ggml_tensor * forward_lora( Qcur, 0, 2, 1, 3); - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // kv_self.k shape [n_embd * kv_size * n_layer, 1] // K shape [n_embd/n_head, n_past + N, n_head, 1] struct ggml_tensor * K = ggml_permute(ctx0, @@ -1470,7 +1470,7 @@ int main(int argc, char ** argv) { /* struct llama_model_lora model_lora; // model.hparams.n_vocab = 6; - // model.hparams.n_ctx = 64; + // model.hparams.kv_size = 64; // model.hparams.n_embd = 128; // model.hparams.n_mult = 2; // model.hparams.n_head = 8; @@ -1478,7 +1478,7 @@ int main(int argc, char ** argv) { // model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head; model_lora.hparams.n_vocab = 16; - model_lora.hparams.n_ctx = 32; + model_lora.hparams.kv_size = 32; model_lora.hparams.n_embd = 256; model_lora.hparams.n_mult = 2; model_lora.hparams.n_head = 16; diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index b4b8a38e1..cd1bd9dd5 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -104,7 +104,7 @@ int main(int argc, char ** argv) { llama_context_params ctx_params = llama_context_default_params(); ctx_params.seed = 1234; - ctx_params.n_ctx = n_kv_max; + ctx_params.kv_size = n_kv_max; ctx_params.n_batch = 512; ctx_params.mul_mat_q = mmq; diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index d75c503d5..ddc0973b0 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -38,7 +38,7 @@ let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_par var context_params = llama_context_default_params() context_params.seed = 1234 -context_params.n_ctx = n_kv_req +context_params.kv_size = n_kv_req context_params.n_batch = UInt32(max(n_len, n_parallel)) context_params.n_threads = 8 
context_params.n_threads_batch = 8 @@ -53,12 +53,12 @@ defer { llama_free(context) } -let n_ctx = llama_n_ctx(context) +let kv_size = llama_kv_size(context) -print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n") +print("\nn_len = \(n_len), kv_size = \(kv_size), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n") -if n_kv_req > n_ctx { - print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req) +if n_kv_req > kv_size { + print("error: n_kv_req (%d) > kv_size, the required KV cache size is not big enough\n", n_kv_req) exit(1) } diff --git a/examples/batched/README.md b/examples/batched/README.md index 5d7303317..73e5f8943 100644 --- a/examples/batched/README.md +++ b/examples/batched/README.md @@ -7,7 +7,7 @@ The example demonstrates batched generation from a given prompt ... -main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113 +main: n_len = 32, kv_size = 2048, n_parallel = 4, n_kv_req = 113 Hello my name is diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 9be7eb56b..ed54f94d7 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -78,7 +78,7 @@ int main(int argc, char ** argv) { llama_context_params ctx_params = llama_context_default_params(); ctx_params.seed = 1234; - ctx_params.n_ctx = n_kv_req; + ctx_params.kv_size = n_kv_req; ctx_params.n_batch = std::max(n_len, n_parallel); ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; @@ -90,14 +90,14 @@ int main(int argc, char ** argv) { return 1; } - const int n_ctx = llama_n_ctx(ctx); + const int kv_size = llama_kv_size(ctx); - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); + LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, kv_size, ctx_params.n_batch, n_parallel, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens - if (n_kv_req > n_ctx) { - LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req); - LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); + if (n_kv_req > kv_size) { + LOG_TEE("%s: error: n_kv_req (%d) > kv_size, the required KV cache size is not big enough\n", __func__, n_kv_req); + LOG_TEE("%s: either reduce n_parallel or increase kv_size\n", __func__); return 1; } diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp index 866c6d7a6..fc7bbbb30 100644 --- a/examples/beam-search/beam-search.cpp +++ b/examples/beam-search/beam-search.cpp @@ -139,8 +139,8 @@ int main(int argc, char ** argv) std::vector tokens_list = llama_tokenize(ctx, params.prompt, true); - const size_t max_context_size = llama_n_ctx( ctx ); - const size_t max_tokens_list_size = max_context_size - 4 ; + const size_t max_kv_size = llama_kv_size(ctx); + const size_t max_tokens_list_size = max_kv_size - 4 ; if (tokens_list.size() > max_tokens_list_size) { diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index e89f3de2f..79fb5a94d 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -128,20 +128,20 @@ int main(int argc, char ** argv) { // TODO: perform the bench for all types or for a user specified type const 
ggml_type qtype = GGML_TYPE_Q4_1; - size_t ctx_size = 0; - ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); - ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); - ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez); - ctx_size += ggml_row_size(qtype, sizex*sizey); - ctx_size += ggml_row_size(qtype, sizex*sizey); - ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS - ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS - ctx_size += 1024*1024*16; + size_t kv_size = 0; + kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey); + kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey); + kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizez); + kv_size += ggml_row_size(qtype, sizex * sizey); + kv_size += ggml_row_size(qtype, sizex * sizey); + kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey); // BLAS + kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey); // BLAS + kv_size += 1024 * 1024 * 16; - printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024)); + printf("Allocating Memory of size %zi bytes, %zi MB\n", kv_size, (kv_size / 1024 / 1024)); struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, + /*.mem_size =*/ kv_size, /*.mem_buffer =*/ NULL, /* no_alloc =*/ 0 }; diff --git a/examples/chat-13B.bat b/examples/chat-13B.bat index c5c8ac6ef..49c1e5039 100644 --- a/examples/chat-13B.bat +++ b/examples/chat-13B.bat @@ -15,7 +15,7 @@ rem Adjust to the number of CPU cores you want to use. 
rem if not defined N_THREAD set "N_THREAD=8" rem Number of tokens to predict (made it larger than default because we want a long interaction) if not defined N_PREDICTS set "N_PREDICTS=2048" -if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647" +if not defined GEN_OPTIONS set "GEN_OPTIONS=--kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647" rem Default main script paths set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe" diff --git a/examples/chat-13B.sh b/examples/chat-13B.sh index 35c089d57..92070ffa7 100755 --- a/examples/chat-13B.sh +++ b/examples/chat-13B.sh @@ -15,8 +15,8 @@ N_THREAD="${N_THREAD:-8}" N_PREDICTS="${N_PREDICTS:-2048}" # Note: you can also override the generation options by specifying them on the command line: -# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024 -GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}" +# For example, override the context size by doing: ./chatLLaMa --kv_size 1024 +GEN_OPTIONS="${GEN_OPTIONS:---kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}" DATE_TIME=$(date +%H:%M) DATE_YEAR=$(date +%Y) diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh index 22f5b83d3..694390522 100755 --- a/examples/chat-persistent.sh +++ b/examples/chat-persistent.sh @@ -27,9 +27,9 @@ SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+ SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+' SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d" -CTX_SIZE=2048 -CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW -OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@") 
+KV_SIZE=2048 +KV_ROTATE_POINT=$((KV_SIZE * 3 / 5)) # REVIEW +OPTS=(--model "$MODEL" --kv_size "$KV_SIZE" --repeat_last_n 256 "$@") # An unbuffered `tail -c+N` skip_bytes() { @@ -84,7 +84,7 @@ n_tokens=0 while read -e line; do # Limit generation to remaining context, with a buffer and estimating 2 chars/token for input - n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32)) + n_predict=$((KV_SIZE - n_tokens - ${#line} / 2 - 32)) # Swap prompts when we're about to run out of context if ((n_predict <= 0)); then @@ -97,11 +97,11 @@ while read -e line; do cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE" n_tokens=0 - n_predict=$((CTX_SIZE / 2)) + n_predict=$((KV_SIZE / 2)) fi echo " ${line}" >>"$CUR_PROMPT_FILE" - if ((n_tokens > CTX_ROTATE_POINT)); then + if ((n_tokens > KV_ROTATE_POINT)); then echo " ${line}" >>"$NEXT_PROMPT_FILE" fi @@ -139,7 +139,7 @@ while read -e line; do n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg"))) - if ((n_tokens > CTX_ROTATE_POINT)); then + if ((n_tokens > KV_ROTATE_POINT)); then tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE" fi diff --git a/examples/chat-vicuna.sh b/examples/chat-vicuna.sh index 8c7b7bef4..4fc0893d8 100755 --- a/examples/chat-vicuna.sh +++ b/examples/chat-vicuna.sh @@ -15,8 +15,8 @@ N_THREAD="${N_THREAD:-8}" N_PREDICTS="${N_PREDICTS:-2048}" # Note: you can also override the generation options by specifying them on the command line: -# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024 -GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}" +# For example, override the context size by doing: ./chatLLaMa --kv_size 1024 +GEN_OPTIONS="${GEN_OPTIONS:---kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}" DATE_TIME=$(date +%H:%M) DATE_YEAR=$(date +%Y) diff --git 
a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 8209dcb64..42d4fe327 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -226,7 +226,7 @@ struct llama_vocab { struct my_llama_hparams { uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? + uint32_t kv_size = 512; // this is provided as user input? uint32_t n_embd = 4096; uint32_t n_ff = 11008; uint32_t n_mult = 4; @@ -326,7 +326,7 @@ struct train_params { static void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab: %u\n", __func__, params->n_vocab); - printf("%s: n_ctx: %u\n", __func__, params->n_ctx); + printf("%s: kv_size: %u\n", __func__, params->kv_size); printf("%s: n_embd: %u\n", __func__, params->n_embd); printf("%s: n_mult: %u\n", __func__, params->n_mult); printf("%s: n_head: %u\n", __func__, params->n_head); @@ -732,7 +732,7 @@ static void save_as_llama_model( gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1); gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1); - gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx); + gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.kv_size); gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd); gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff); gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head); @@ -937,7 +937,7 @@ int main(int argc, char ** argv) { struct my_llama_model model; model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); - model.hparams.n_ctx = params.n_ctx; + model.hparams.kv_size = params.n_ctx; model.hparams.n_embd = config.dim; //params.n_embd; model.hparams.n_ff = config.hidden_dim; model.hparams.n_mult = 32;//params.n_mult; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index acff715e9..b0b59b522 100644 --- 
a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -88,11 +88,11 @@ int main(int argc, char ** argv) { } const int n_ctx_train = llama_n_ctx_train(model); - const int n_ctx = llama_n_ctx(ctx); + const int kv_size = llama_kv_size(ctx); - if (n_ctx > n_ctx_train) { + if (kv_size > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); + __func__, n_ctx_train, kv_size); } // print system information @@ -106,7 +106,7 @@ int main(int argc, char ** argv) { // max batch size const uint64_t n_batch = params.n_batch; - GGML_ASSERT(params.n_batch == params.n_ctx); + GGML_ASSERT(params.n_batch == params.kv_size); // tokenize the prompts and trim std::vector> inputs; diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 98bf5a07a..e3fdb3546 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -16,7 +16,7 @@ struct my_llama_hparams { uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; + uint32_t kv_size = 512; uint32_t n_embd = 4096; uint32_t n_ff = 11008; uint32_t n_head = 32; @@ -190,7 +190,7 @@ static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; static void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab : %u\n", __func__, params->n_vocab); - printf("%s: n_ctx : %u\n", __func__, params->n_ctx); + printf("%s: kv_size : %u\n", __func__, params->kv_size); printf("%s: n_embd : %u\n", __func__, params->n_embd); printf("%s: n_ff : %u\n", __func__, params->n_ff); printf("%s: n_head : %u\n", __func__, params->n_head); @@ -250,7 +250,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h }; GGUF_GET_KEY(ctx, hparams->n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); - GGUF_GET_KEY(ctx, hparams->n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH)); + GGUF_GET_KEY(ctx, hparams->kv_size, gguf_get_val_u32, 
GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH)); GGUF_GET_KEY(ctx, hparams->n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); GGUF_GET_KEY(ctx, hparams->n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); GGUF_GET_KEY(ctx, hparams->n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); @@ -268,7 +268,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h } } -static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t n_ctx) { +static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t kv_size) { auto & hparams = model->hparams; std::vector tn_buf; @@ -298,7 +298,7 @@ static void init_model(struct llama_model * input, struct my_llama_model * model gguf_free(mctx); } hparams.n_vocab = llama_n_vocab(input); - hparams.n_ctx = n_ctx; + hparams.kv_size = kv_size; // get tensors from llama_model (possibly mmapped) model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD)); @@ -529,7 +529,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( const int n_past = 0; const int N = n_tokens; const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; + const int kv_size = hparams.kv_size; const int n_vocab = hparams.n_vocab; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -558,13 +558,13 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( ggml_set_input(KQ_pos); // rope has so much parameters that we make a custom function for it - auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] + auto rope = [ctx, KQ_pos, n_rot, kv_size, rope_freq_base, rope_freq_scale] (struct ggml_tensor * t) -> struct ggml_tensor * { // not capturing these, to silcence warnings const int rope_mode = 0; return ggml_rope_custom(ctx, - t, KQ_pos, n_rot, rope_mode, n_ctx, 0, 
+ t, KQ_pos, n_rot, rope_mode, kv_size, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f ); }; @@ -848,7 +848,7 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch); gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype); - gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx); + gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.kv_size); gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd); gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff); gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head); @@ -1554,9 +1554,9 @@ int main(int argc, char ** argv) { bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train); if (existed) { - // overwrite last n_ctx with user provided n_ctx + // overwrite last kv_size with user provided kv_size if (params.common.custom_n_ctx) { - model.hparams.n_ctx = params.common.n_ctx; + model.hparams.kv_size = params.common.n_ctx; } const bool opt_param_count_changed = ( @@ -1625,7 +1625,7 @@ int main(int argc, char ** argv) { printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); printf("%s: opt iter %d\n", __func__, opt->iter); - int n_tokens = model.hparams.n_ctx; + int n_tokens = model.hparams.kv_size; int n_vocab = model.hparams.n_vocab; int n_batch = params.common.n_batch; diff --git a/examples/gpt4all.sh b/examples/gpt4all.sh index 5fd739e55..54bc1188d 100755 --- a/examples/gpt4all.sh +++ b/examples/gpt4all.sh @@ -10,6 +10,6 @@ cd .. 
./main --color --instruct --threads 4 \ --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \ --file ./prompts/alpaca.txt \ - --batch_size 8 --ctx_size 2048 -n -1 \ + --batch_size 8 --kv_size 2048 -n -1 \ --repeat_last_n 64 --repeat_penalty 1.3 \ --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95 diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index f21bc48f3..7ecb8625e 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -325,7 +325,7 @@ static void process_logits( static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) { const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - const int n_ctx = llama_n_ctx(ctx); + const int kv_size = llama_kv_size(ctx); auto tim1 = std::chrono::high_resolution_clock::now(); fprintf(stderr, "%s: tokenizing the input ..\n", __func__); @@ -336,17 +336,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); if (from_chunk > 0) { - if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) { + if (size_t((from_chunk + 2)*kv_size) >= tokens.size()) { fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk); return false; } - fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx); - tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx); + fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk * kv_size); + tokens.erase(tokens.begin(), tokens.begin() + from_chunk * kv_size); } - if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx, - n_ctx); + if (int(tokens.size()) < 2*kv_size) { + fprintf(stderr, "%s: you need at least %d tokens for a context of %d 
tokens\n", __func__, 2 * kv_size, + kv_size); fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); return false; } @@ -359,7 +359,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool prob_history.resize(tokens.size()); } - const int n_chunk_max = tokens.size() / n_ctx; + const int n_chunk_max = tokens.size() / kv_size; const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_vocab = llama_n_vocab(llama_get_model(ctx)); @@ -373,16 +373,16 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool std::vector workers(std::thread::hardware_concurrency() - 1); - const int num_batches = (n_ctx + n_batch - 1) / n_batch; + const int num_batches = (kv_size + n_batch - 1) / n_batch; std::vector logits; if (compute_ppl && num_batches > 1) { - logits.reserve((size_t)n_ctx * n_vocab); + logits.reserve((size_t)kv_size * n_vocab); } for (int i = 0; i < n_chunk; ++i) { - const int start = i * n_ctx; - const int end = start + n_ctx; + const int start = i * kv_size; + const int end = start + kv_size; std::vector logits; @@ -431,11 +431,11 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool } if (compute_ppl) { - const int first = n_ctx/2; + const int first = kv_size / 2; const auto all_logits = num_batches > 1 ? 
logits.data() : llama_get_logits(ctx); - process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first, workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); - count += n_ctx - first - 1; + count += kv_size - first - 1; printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); fflush(stdout); @@ -553,7 +553,7 @@ int main(int argc, char ** argv) { } params.logits_all = true; - params.n_batch = std::min(params.n_batch, params.n_ctx); + params.n_batch = std::min(params.n_batch, params.kv_size); print_build_info(); @@ -593,9 +593,9 @@ int main(int argc, char ** argv) { } const int n_ctx_train = llama_n_ctx_train(model); - if (params.n_ctx > n_ctx_train) { + if (params.kv_size > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, params.n_ctx); + __func__, n_ctx_train, params.kv_size); } // print system information diff --git a/examples/infill/README.md b/examples/infill/README.md index 8c97f719b..aaddf8fdd 100644 --- a/examples/infill/README.md +++ b/examples/infill/README.md @@ -14,7 +14,8 @@ In this section, we cover the most commonly used options for running the `infill - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. -- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. +- `-c N`, `--ctx-size N`: Deprecated, use `--kv-size` instead. 
+- `-kv N`, `--kv-size N`: Specify the total size of the KV cache for the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. ## Input Prompts diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 92c67b7cf..f5ccdc3db 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -135,9 +135,9 @@ int main(int argc, char ** argv) { return 0; } - if (params.n_ctx != 0 && params.n_ctx < 8) { + if (params.kv_size != 0 && params.kv_size < 8) { LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); - params.n_ctx = 8; + params.kv_size = 8; } if (params.instruct) { printf("\n************\n"); @@ -225,12 +225,12 @@ int main(int argc, char ** argv) { } const int n_ctx_train = llama_n_ctx_train(model); - const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); + const int kv_size = llama_kv_size(ctx); + LOG("kv_size: %d\n", kv_size); - if (n_ctx > n_ctx_train) { + if (kv_size > n_ctx_train) { LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); + __func__, n_ctx_train, kv_size); } // print system information @@ -291,8 +291,8 @@ int main(int argc, char ** argv) { LOG("guidance_offset: %s", log_tostr(guidance_offset)); } - if ((int) embd_inp.size() > n_ctx - 4) { - LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + if ((int) embd_inp.size() > kv_size - 4) { + LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), kv_size - 4); return 1; } @@ -366,7 +366,7 @@ int main(int argc, char ** argv) { } } LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); - LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_TEE("generate: kv_size = %d, n_batch = %d, 
n_predict = %d, n_keep = %d\n", kv_size, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("\n\n"); LOG_TEE("\n##### Infill mode #####\n\n"); @@ -416,9 +416,9 @@ int main(int argc, char ** argv) { while (n_remain != 0 || params.interactive) { // predict if (!embd.empty()) { - // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via + // Note: kv_size - 4 here is to match the logic for commandline prompt handling via // --prompt or --file which uses the same value. - int max_embd_size = n_ctx - 4; + int max_embd_size = kv_size - 4; // Ensure the input doesn't exceed the context size by truncating embd if necessary. if ((int) embd.size() > max_embd_size) { @@ -434,8 +434,8 @@ int main(int argc, char ** argv) { // infinite text generation via context swapping // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) - // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { + // - take half of the last (kv_size - n_keep) tokens and recompute the logits in batches + if (n_past + (int) embd.size() + std::max(0, guidance_offset) > kv_size) { if (params.n_predict == -2) { LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; @@ -444,8 +444,8 @@ int main(int argc, char ** argv) { const int n_left = n_past - params.n_keep - 1; const int n_discard = n_left/2; - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", - n_past, n_left, n_ctx, params.n_keep, n_discard); + LOG("context full, swapping: n_past = %d, n_left = %d, kv_size = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, kv_size, params.n_keep, n_discard); llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); diff --git 
a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 11410f8ae..428994111 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -514,7 +514,7 @@ struct cmd_params_instance { llama_context_params to_llama_cparams() const { llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = n_prompt + n_gen; + cparams.kv_size = n_prompt + n_gen; cparams.n_batch = n_batch; cparams.type_k = type_k; cparams.type_v = type_v; diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 58fcf40c6..3c006975a 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -68,8 +68,8 @@ actor LlamaContext { print("Using \(n_threads) threads") var ctx_params = llama_context_default_params() - ctx_params.seed = 1234 - ctx_params.n_ctx = 2048 + ctx_params.seed = 1234 + ctx_params.kv_size = 2048 ctx_params.n_threads = UInt32(n_threads) ctx_params.n_threads_batch = UInt32(n_threads) @@ -112,13 +112,13 @@ actor LlamaContext { tokens_list = tokenize(text: text, add_bos: true) temporary_invalid_cchars = [] - let n_ctx = llama_n_ctx(context) + let kv_size = llama_kv_size(context) let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count) - print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)") + print("\n n_len = \(n_len), kv_size = \(kv_size), n_kv_req = \(n_kv_req)") - if n_kv_req > n_ctx { - print("error: n_kv_req > n_ctx, the required KV cache size is not big enough") + if n_kv_req > kv_size { + print("error: n_kv_req > kv_size, the required KV cache size is not big enough") } for id in tokens_list { diff --git a/examples/llama2-13b.sh b/examples/llama2-13b.sh index 92b3f6dd8..bac0d4190 100755 --- a/examples/llama2-13b.sh +++ b/examples/llama2-13b.sh @@ -9,7 +9,7 @@ cd .. 
./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \ --color \ - --ctx_size 2048 \ + --kv_size 2048 \ -n -1 \ -ins -b 256 \ --top_k 10000 \ diff --git a/examples/llama2.sh b/examples/llama2.sh index 221b37553..0f8f42140 100755 --- a/examples/llama2.sh +++ b/examples/llama2.sh @@ -9,7 +9,7 @@ cd .. ./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \ --color \ - --ctx_size 2048 \ + --kv_size 2048 \ -n -1 \ -ins -b 256 \ --top_k 10000 \ diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index e29da6cb2..d2d681bbe 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -230,7 +230,7 @@ static struct llava_context * llava_init(gpt_params * params) { } llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); - ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings + ctx_params.kv_size = params->kv_size < 2048 ? 2048 : params->kv_size; // we need a longer context size to process image embeddings llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 4cb65a07b..5dc035775 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -103,15 +103,15 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector const size_t num_images = num_patches_width * num_patches_height + 1; // TODO: size calculation is not calculated - it's only tens of MB - size_t ctx_size = 0; + size_t kv_size = 0; { - ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features - ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); + kv_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features + kv_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); } struct ggml_init_params params { - /*.mem_size =*/ ctx_size, + /*.mem_size =*/ kv_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, // NOTE: 
this should be false when using the legacy API }; diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index e2551e7a4..a7b34884a 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -73,8 +73,8 @@ int main(int argc, char ** argv) { inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); all = inp; - const int max_context_size = llama_n_ctx(ctx); - const int max_tokens_list_size = max_context_size - 4; + const int max_kv_size = llama_kv_size(ctx); + const int max_tokens_list_size = max_kv_size - 4; if ((int) inp.size() > max_tokens_list_size) { fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); @@ -117,7 +117,7 @@ int main(int argc, char ** argv) { // seq_id == 0 : the current input token // seq_id [1, W] : tokens from the past N - 1 Jacobi iterations // seq_id [W + 1, W + G] : verification n-grams - llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1); + llama_batch batch = llama_batch_init(params.kv_size, 0, W + G + 1); // target model sampling context struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index b53fae110..f04c31780 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -47,8 +47,8 @@ int main(int argc, char ** argv){ std::vector inp; inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); - const int max_context_size = llama_n_ctx(ctx); - const int max_tokens_list_size = max_context_size - 4; + const int max_kv_size = llama_kv_size(ctx); + const int max_tokens_list_size = max_kv_size - 4; if ((int) inp.size() > max_tokens_list_size) { fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); @@ -86,7 +86,7 @@ int main(int argc, char ** argv){ std::vector draft; - llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 
1); + llama_batch batch_tgt = llama_batch_init(params.kv_size, 0, 1); // debug struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1); diff --git a/examples/main/README.md b/examples/main/README.md index 7f84e4262..6e665233b 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -70,7 +70,8 @@ In this section, we cover the most commonly used options for running the `main` - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. -- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. +- `-c N`, `--ctx-size N`: Deprecated, use `--kv-size` instead. +- `-kv N`, `--kv-size N`: Set the size of the KV cache for the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. ## Input Prompts @@ -134,15 +135,15 @@ By understanding and utilizing these interaction options, you can create engagin During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations. -### Context Size +### KV Context Size -The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. 
A larger context size helps the model to better comprehend and generate responses for longer input or conversations. +The `--kv-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations. -- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results. +- `-kv N, --kv-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results. ### Extended Context Size -Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8. +Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--kv-size` to 32768 (32k) and `--rope-scale` to 8. - `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model. @@ -152,7 +153,7 @@ The `--keep` option allows users to retain the original prompt when the model ru - `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept).
Use `-1` to retain all tokens from the initial prompt. -By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation. +By utilizing context management options like `--kv-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation. ## Generation Flags @@ -181,12 +182,12 @@ Example usage: `--temp 0.5` ### Repeat Penalty - `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.1). -- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size). +- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = kv-size). - `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty. The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1. -The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`). +The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. 
A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`kv-size`). Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases. diff --git a/examples/main/main.cpp b/examples/main/main.cpp index f5d2f4893..6af6384f6 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -157,9 +157,9 @@ int main(int argc, char ** argv) { return 0; } - if (params.n_ctx != 0 && params.n_ctx < 8) { - LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); - params.n_ctx = 8; + if (params.kv_size != 0 && params.kv_size < 8) { + LOG_TEE("%s: warning: minimum KV size is 8, using minimum size.\n", __func__); + params.kv_size = 8; } if (params.rope_freq_base != 0.0) { @@ -208,12 +208,12 @@ int main(int argc, char ** argv) { } const int n_ctx_train = llama_n_ctx_train(model); - const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); + const int kv_size = llama_kv_size(ctx); + LOG("kv_size: %d\n", kv_size); - if (n_ctx > n_ctx_train) { + if (kv_size > n_ctx_train) { LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); + __func__, n_ctx_train, kv_size); } // print system information @@ -233,7 +233,7 @@ int main(int argc, char ** argv) { LOG_TEE("%s: The session file is empty. 
A new session will be initialized.\n", __func__); } else { // The file exists and is not empty - session_tokens.resize(n_ctx); + session_tokens.resize(kv_size); size_t n_token_count_out = 0; if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); @@ -289,8 +289,8 @@ int main(int argc, char ** argv) { LOG("guidance_offset: %s", log_tostr(guidance_offset)); } - if ((int) embd_inp.size() > n_ctx - 4) { - LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + if ((int) embd_inp.size() > kv_size - 4) { + LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), kv_size - 4); return 1; } @@ -450,7 +450,7 @@ int main(int argc, char ** argv) { } LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str()); - LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_TEE("generate: kv_size = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", kv_size, params.n_batch, params.n_predict, params.n_keep); // group-attention state // number of grouped KV tokens so far (used only if params.grp_attn_n > 1) @@ -463,7 +463,7 @@ int main(int argc, char ** argv) { GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT - //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT + //GGML_ASSERT(kv_size >= n_ctx_train * ga_n && "kv_size must be at least n_ctx_train * grp_attn_n"); // NOLINT LOG_TEE("self-extend: n_ctx_train = %d, 
grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); } LOG_TEE("\n\n"); @@ -514,9 +514,9 @@ int main(int argc, char ** argv) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict if (!embd.empty()) { - // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via + // Note: (kv_size - 4) here is to match the logic for commandline prompt handling via // --prompt or --file which uses the same value. - int max_embd_size = n_ctx - 4; + int max_embd_size = kv_size - 4; // Ensure the input doesn't exceed the context size by truncating embd if necessary. if ((int) embd.size() > max_embd_size) { @@ -533,8 +533,8 @@ int main(int argc, char ** argv) { // infinite text generation via context shifting // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) - // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { + // - take half of the last (kv_size - n_keep) tokens and recompute the logits in batches + if (n_past + (int) embd.size() + std::max(0, guidance_offset) > kv_size) { if (params.n_predict == -2) { LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; @@ -543,8 +543,8 @@ int main(int argc, char ** argv) { const int n_left = n_past - params.n_keep - 1; const int n_discard = n_left/2; - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", - n_past, n_left, n_ctx, params.n_keep, n_discard); + LOG("context full, swapping: n_past = %d, n_left = %d, kv_size = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, kv_size, params.n_keep, n_discard); llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); @@ -666,7 +666,7 @@ int main(int argc, char ** argv) { 
LOG("n_past = %d\n", n_past); // Display total tokens alongside total time if (params.n_print > 0 && n_past % params.n_print == 0) { - LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, kv_size); } } diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 7d11fcd59..dd000588e 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -152,7 +152,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n\n"); fflush(stderr); - const int n_ctx = llama_n_ctx(ctx); + const int kv_size = llama_kv_size(ctx); std::vector clients(n_clients); for (size_t i = 0; i < clients.size(); ++i) { @@ -169,7 +169,7 @@ int main(int argc, char ** argv) { // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time - llama_batch batch = llama_batch_init(n_ctx, 0, 1); + llama_batch batch = llama_batch_init(kv_size, 0, 1); int32_t n_total_prompt = 0; int32_t n_total_gen = 0; diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index e12a1cdf1..0b9f9c14e 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -92,7 +92,7 @@ int main(int argc, char ** argv) { llama_context_params ctx_params = llama_context_default_params(); ctx_params.seed = seed; - ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep; + ctx_params.kv_size = llama_n_ctx_train(model)*n_grp + n_keep; ctx_params.n_batch = 512; ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; @@ -121,12 +121,12 @@ int main(int argc, char ** argv) { // total length of the sequences including the prompt const int n_len = n_tokens_all + n_predict; - const int n_ctx = llama_n_ctx(ctx) - n_keep; - const int n_kv_req = llama_n_ctx(ctx); + const int kv_size = llama_kv_size(ctx) - n_keep; + const int n_kv_req = llama_kv_size(ctx); const int n_batch = ctx_params.n_batch; const int n_batch_grp = ctx_params.n_batch/n_grp; - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch); + LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, kv_size, n_kv_req, n_grp, n_batch); // print the prompt token-by-token @@ -140,7 +140,7 @@ int main(int argc, char ** argv) { int n_past = 0; // fill the KV cache - for (int i = 0; i < n_ctx; i += n_batch) { + for (int i = 0; i < kv_size; i += n_batch) { if (i > 0 && n_grp > 1) { // if SelfExtend is enabled, we compress the position from the last batch by a factor of n_grp const int ib = i/n_batch - 1; @@ -174,13 +174,13 @@ int main(int argc, char ** argv) { } } - for (int i = n_ctx; i < n_tokens_all; i += n_batch) { + for (int i = kv_size; i < n_tokens_all; i += n_batch) { const int n_discard = n_batch; LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, kv_size, -n_discard); n_past -= n_discard; @@ -203,13 +203,13 @@ int main(int argc, char ** argv) { } { - const int n_discard = n_past - n_ctx + n_predict; + const int n_discard = n_past - kv_size + n_predict; if (n_discard > 0) { LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, 0, 
n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, kv_size, -n_discard); n_past -= n_discard; } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 9ec989389..6d1a5f677 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -320,11 +320,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); - const int n_ctx = llama_n_ctx(ctx); + const int kv_size = llama_kv_size(ctx); - if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, - n_ctx); + if (int(tokens.size()) < 2*kv_size) { + fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n", __func__, 2 * kv_size, + kv_size); fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); return {std::move(tokens), 0., {}, {}}; } @@ -340,13 +340,13 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & return {tokens, -1, logit_history, prob_history}; } - const int calc_chunk = n_ctx; + const int calc_chunk = kv_size; fprintf(stderr, "%s: have %zu tokens. 
Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk); if (int(tokens.size()) <= calc_chunk) { - fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__, - tokens.size(), n_ctx, params.ppl_stride); + fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n", __func__, + tokens.size(), kv_size, params.ppl_stride); return {tokens, -1, logit_history, prob_history}; } @@ -414,8 +414,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); } - //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); - for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) { + //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.kv_size - params.ppl_stride + start, params.kv_size + start); + for (int j = kv_size - params.ppl_stride - 1; j < kv_size - 1; ++j) { // Calculate probability of next token, given the previous ones. 
const std::vector tok_logits( @@ -453,7 +453,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par // BOS tokens will be added for each chunk before eval const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - const int n_ctx = llama_n_ctx(ctx); + const int kv_size = llama_kv_size(ctx); std::ofstream logits_stream; if (!params.logits_file.empty()) { @@ -464,7 +464,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str()); logits_stream.write("_logits_", 8); - logits_stream.write(reinterpret_cast(&n_ctx), sizeof(n_ctx)); + logits_stream.write(reinterpret_cast(&kv_size), sizeof(kv_size)); } auto tim1 = std::chrono::high_resolution_clock::now(); @@ -475,9 +475,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par auto tim2 = std::chrono::high_resolution_clock::now(); fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); - if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, - n_ctx); + if (int(tokens.size()) < 2*kv_size) { + fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n", __func__, 2 * kv_size, + kv_size); fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); return {std::move(tokens), 0., {}, {}}; } @@ -488,7 +488,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par std::vector prob_history; prob_history.resize(tokens.size()); - const int n_chunk_max = tokens.size() / n_ctx; + const int n_chunk_max = tokens.size() / kv_size; const int n_chunk = params.n_chunks < 0 ? 
n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_vocab = llama_n_vocab(llama_get_model(ctx)); @@ -498,11 +498,11 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par double nll = 0.0; double nll2 = 0.0; - const int num_batches = (n_ctx + n_batch - 1) / n_batch; + const int num_batches = (kv_size + n_batch - 1) / n_batch; std::vector logits; if (num_batches > 1) { - logits.reserve((size_t)n_ctx * n_vocab); + logits.reserve((size_t)kv_size * n_vocab); } fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); @@ -513,14 +513,14 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par if (!params.logits_file.empty()) { logits_stream.write((const char *)&n_vocab, sizeof(n_vocab)); logits_stream.write((const char *)&n_chunk, sizeof(n_chunk)); - logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0])); + logits_stream.write((const char *)tokens.data(), n_chunk * kv_size * sizeof(tokens[0])); const int nv = 2*((n_vocab + 1)/2) + 4; - log_probs.resize(n_ctx * nv); + log_probs.resize(kv_size * nv); } for (int i = 0; i < n_chunk; ++i) { - const int start = i * n_ctx; - const int end = start + n_ctx; + const int start = i * kv_size; + const int end = start + kv_size; const auto t_start = std::chrono::high_resolution_clock::now(); @@ -566,7 +566,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); } - // We get the logits for all the tokens in the context window (params.n_ctx) + // We get the logits for all the tokens in the context window (params.kv_size) // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity, // calculate the perplexity over the last half of the window (so the model always has // some context to predict the token). 
@@ -578,16 +578,16 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par // Example, we have a context window of 512, we will compute perplexity for each of the // last 256 tokens. Then, we split the input up into context window size chunks to // process the entire prompt. - const int first = n_ctx/2; + const int first = kv_size/2; const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); if (!params.logits_file.empty()) { - process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first, workers, log_probs, nll, nll2); } else { - process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first, workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); } - count += n_ctx - first - 1; + count += kv_size - first - 1; // perplexity is e^(average negative log-likelihood) if (params.ppl_output_type == 0) { @@ -596,7 +596,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par double av = nll/count; double av2 = nll2/count - av*av; if (av2 > 0) av2 = sqrt(av2/(count-1)); - printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); + printf("%8d %.4lf %4lf %4lf\n", i*kv_size, std::exp(nll / count), av, av2); } fflush(stdout); @@ -805,16 +805,16 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { double acc = 0.0f; const int n_vocab = llama_n_vocab(llama_get_model(ctx)); - const int n_ctx = llama_n_ctx(ctx); + const int kv_size = llama_kv_size(ctx); const int n_batch = params.n_batch; const int max_tasks_per_batch = 32; const int max_seq = 4*max_tasks_per_batch; - llama_batch batch = 
llama_batch_init(n_ctx, 0, max_seq); + llama_batch batch = llama_batch_init(kv_size, 0, max_seq); std::vector tok_logits(n_vocab); - std::vector batch_logits(n_vocab*n_ctx); + std::vector batch_logits(n_vocab*kv_size); std::vector> eval_pairs; std::vector eval_results; @@ -832,7 +832,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { // each task has 4 unique seuqnce ids - one for each ending // the common prefix is shared among the 4 sequences to save tokens // we extract logits only from the last common token and from all ending tokens of each sequence - while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) { + while (n_cur + (int) hs_data[i1].required_tokens <= kv_size) { auto & hs_cur = hs_data[i1]; const int s0 = 4*(i1 - i0); @@ -1082,16 +1082,16 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__); const int n_vocab = llama_n_vocab(llama_get_model(ctx)); - const int n_ctx = llama_n_ctx(ctx); + const int kv_size = llama_kv_size(ctx); const int n_batch = params.n_batch; const int max_tasks_per_batch = 128; const int max_seq = 2*max_tasks_per_batch; - llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); + llama_batch batch = llama_batch_init(kv_size, 0, max_seq); std::vector tok_logits(n_vocab); - std::vector batch_logits(n_vocab*n_ctx); + std::vector batch_logits(n_vocab*kv_size); std::vector> eval_pairs; std::vector eval_results; @@ -1108,7 +1108,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { llama_batch_clear(batch); - while (n_cur + (int) data[i1].required_tokens <= n_ctx) { + while (n_cur + (int) data[i1].required_tokens <= kv_size) { const int s0 = 2*(i1 - i0); if (s0 + 2 > max_seq) { break; @@ -1434,16 +1434,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params printf("\ntask\tacc_norm\n"); const int n_vocab = 
llama_n_vocab(llama_get_model(ctx)); - const int n_ctx = llama_n_ctx(ctx); + const int kv_size = llama_kv_size(ctx); const int n_batch = params.n_batch; const int max_tasks_per_batch = 32; const int max_seq = 4*max_tasks_per_batch; - llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); + llama_batch batch = llama_batch_init(kv_size, 0, max_seq); std::vector tok_logits(n_vocab); - std::vector batch_logits(n_vocab*n_ctx); + std::vector batch_logits(n_vocab*kv_size); std::vector> eval_pairs; std::vector eval_results; @@ -1467,7 +1467,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params // the common prefix is shared among the 4 sequences to save tokens // we extract logits only from the last common token and from all ending tokens of each sequence int s0 = 0; - while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) { + while (n_cur + (int) tasks[i1].required_tokens <= kv_size) { auto& cur_task = tasks[i1]; int num_answers = cur_task.seq_tokens.size(); @@ -1620,11 +1620,11 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { } } - uint32_t n_ctx; - in.read((char *)&n_ctx, sizeof(n_ctx)); - if (n_ctx > llama_n_ctx(ctx)) { - fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n", - __func__, params.logits_file.c_str(), n_ctx, params.n_ctx); + uint32_t kv_size; + in.read((char *)&kv_size, sizeof(kv_size)); + if (kv_size > llama_kv_size(ctx)) { + fprintf(stderr, "%s: %s has been computed with %u, while the current KV Cache size is %d. 
Increase it with -kv and retry\n", + __func__, params.logits_file.c_str(), kv_size, params.kv_size); } int n_vocab, n_chunk; @@ -1638,22 +1638,22 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); } - std::vector tokens(n_ctx * n_chunk); + std::vector tokens(kv_size * n_chunk); if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) { fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str()); return; } const int n_batch = params.n_batch; - const int num_batches = (n_ctx + n_batch - 1)/n_batch; + const int num_batches = (kv_size + n_batch - 1)/n_batch; const int nv = 2*((n_vocab + 1)/2) + 4; const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - std::vector log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv); - std::vector kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); + std::vector log_probs_uint16(size_t(kv_size - 1 - kv_size/2) * nv); + std::vector kld_values(size_t(kv_size - 1 - kv_size /2)*n_chunk); std::vector logits; if (num_batches > 1) { - logits.reserve(n_ctx * n_vocab); + logits.reserve(kv_size * n_vocab); } std::vector workers(std::thread::hardware_concurrency() - 1); @@ -1672,8 +1672,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { auto kld_ptr = kld_values.data(); for (int i = 0; i < n_chunk; ++i) { - const int start = i * n_ctx; - const int end = start + n_ctx; + const int start = i * kv_size; + const int end = start + kv_size; const auto t_start = std::chrono::high_resolution_clock::now(); @@ -1726,11 +1726,11 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL-Divergence Same top\n"); } - const int first = n_ctx/2; + const int first = kv_size/2; const float * all_logits = num_batches > 1 ? 
logits.data() : llama_get_logits(ctx); - process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first, workers, log_probs_uint16, kld, kld_ptr); - kld_ptr += n_ctx - 1 - first; + kld_ptr += kv_size - 1 - first; auto ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count); @@ -1788,12 +1788,12 @@ int main(int argc, char ** argv) { } params.logits_all = true; - params.n_batch = std::min(params.n_batch, params.n_ctx); + params.n_batch = std::min(params.n_batch, params.kv_size); if (params.ppl_stride > 0) { - fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n", - params.n_ctx, params.n_ctx + params.ppl_stride/2); - params.n_ctx += params.ppl_stride/2; + fprintf(stderr, "Will perform strided perplexity calculation -> adjusting KV size from %d to %d\n", + params.kv_size, params.kv_size + params.ppl_stride / 2); + params.kv_size += params.ppl_stride/2; } print_build_info(); @@ -1823,9 +1823,9 @@ int main(int argc, char ** argv) { } const int n_ctx_train = llama_n_ctx_train(model); - if (params.n_ctx > n_ctx_train) { + if (params.kv_size > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, params.n_ctx); + __func__, n_ctx_train, params.kv_size); } // print system information diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 1d05f1391..bddb51065 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -319,7 +319,7 @@ int main(int argc, char ** argv) { } auto cparams = llama_context_default_params(); - cparams.n_ctx = 256; + cparams.kv_size = 256; cparams.seed = 1; ctx = 
llama_new_context_with_model(model, cparams); diff --git a/examples/server-llama2-13B.sh b/examples/server-llama2-13B.sh index 17fedc2b1..ff8c340d3 100755 --- a/examples/server-llama2-13B.sh +++ b/examples/server-llama2-13B.sh @@ -12,7 +12,7 @@ PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat-system.txt} N_THREAD="${N_THREAD:-12}" # Note: you can also override the generation options by specifying them on the command line: -GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}" +GEN_OPTIONS="${GEN_OPTIONS:---kv_size 4096 --batch-size 1024}" # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS diff --git a/examples/server/README.md b/examples/server/README.md index 1dbc4f6c9..cef11ee14 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -174,7 +174,7 @@ node index.js `repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1). - `repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size). + `repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = kv-size). `penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true). @@ -239,7 +239,7 @@ Notice that each `probs` is an array of length `n_probs`. - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string. 
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options) -- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model` +- `generation_settings`: The provided options above excluding `prompt` but including `kv_size`, `model` - `model`: The path to the model loaded with `-m` - `prompt`: The provided `prompt` - `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token @@ -249,7 +249,7 @@ Notice that each `probs` is an array of length `n_probs`. - `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second` - `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`) - `tokens_evaluated`: Number of tokens evaluated in total from the prompt -- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`) +- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the KV size (`kv_size`) - **POST** `/tokenize`: Tokenize a given text. @@ -404,7 +404,7 @@ Notice that each `probs` is an array of length `n_probs`. 
"mirostat_eta": 0.10000000149011612, "mirostat_tau": 5.0, "model": "llama-2-7b-32k-instruct.Q2_K.gguf", - "n_ctx": 2048, + "kv_size": 2048, "n_keep": 0, "n_predict": 100000, "n_probs": 0, diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e66c7bcdf..7182c06dd 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -155,7 +155,7 @@ struct llama_client_slot int64_t t_last_used = -1; // generation props - int32_t n_ctx = 0; // context size per slot + int32_t kv_size = 0; // KV size per slot int32_t n_past = 0; int32_t n_decoded = 0; int32_t n_remaining = -1; @@ -325,7 +325,7 @@ struct llama_server_context bool all_slots_are_idle = false; bool add_bos_token = true; - int32_t n_ctx; // total context for all clients / slots + int32_t kv_size; // total KV Cache for all clients / slots // system prompt bool system_need_update = false; @@ -369,8 +369,8 @@ struct llama_server_context return false; } - if (params.n_ctx < 2048) { // request larger context for the image embedding - params.n_ctx = 2048; + if (params.kv_size < 2048) { // request larger context for the image embedding + params.kv_size = 2048; } } @@ -392,7 +392,7 @@ struct llama_server_context } } - n_ctx = llama_n_ctx(ctx); + kv_size = llama_kv_size(ctx); add_bos_token = llama_should_add_bos_token(model); @@ -403,7 +403,7 @@ struct llama_server_context // create slots all_slots_are_idle = true; - const int32_t n_ctx_slot = n_ctx / params.n_parallel; + const int32_t kv_size_slot = kv_size / params.n_parallel; LOG_TEE("Available slots:\n"); for (int i = 0; i < params.n_parallel; i++) @@ -411,10 +411,10 @@ struct llama_server_context llama_client_slot slot; slot.id = i; - slot.n_ctx = n_ctx_slot; + slot.kv_size = kv_size_slot; slot.n_predict = params.n_predict; - LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); + LOG_TEE(" -> Slot %i - max KV Size: %i\n", slot.id, kv_size_slot); const int ga_n = params.grp_attn_n; const int ga_w = params.grp_attn_w; @@ 
-423,7 +423,7 @@ struct llama_server_context GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT - //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT + //GGML_ASSERT(kv_size >= n_ctx_train * ga_n && "kv_size must be at least n_ctx_train * ga_n"); // NOLINT LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w); } @@ -439,7 +439,7 @@ struct llama_server_context default_generation_settings_for_props = get_formated_generation(slots.front()); default_generation_settings_for_props["seed"] = -1; - batch = llama_batch_init(n_ctx, 0, params.n_parallel); + batch = llama_batch_init(kv_size, 0, params.n_parallel); } std::vector tokenize(const json & json_prompt, bool add_bos) const @@ -1065,7 +1065,7 @@ struct llama_server_context } return json { - {"n_ctx", slot.n_ctx}, + {"kv_size", slot.kv_size}, {"n_predict", slot.n_predict}, {"model", params.model_alias}, {"seed", slot.params.seed}, @@ -1474,7 +1474,7 @@ struct llama_server_context { if (slot.ga_n == 1) { - if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx) + if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.kv_size) { // Shift context const int n_left = system_tokens.size() + slot.n_past - slot.params.n_keep - 1; @@ -1496,7 +1496,7 @@ struct llama_server_context slot.truncated = true; LOG_VERBOSE("context shift", { - { "n_ctx", n_ctx }, + { "kv_size", kv_size }, { "n_keep", params.n_keep }, { "n_left", n_left }, }); @@ -1598,12 +1598,12 @@ struct llama_server_context { slot.params.n_keep = slot.num_prompt_tokens; } - slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); + slot.params.n_keep = std::min(slot.kv_size - 4, slot.params.n_keep); // if input prompt is too 
big, truncate it - if (slot.num_prompt_tokens >= slot.n_ctx) + if (slot.num_prompt_tokens >= slot.kv_size) { - const int n_left = slot.n_ctx - slot.params.n_keep; + const int n_left = slot.kv_size - slot.params.n_keep; const int n_block_size = n_left / 2; const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; @@ -1611,7 +1611,7 @@ struct llama_server_context new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end()); LOG_VERBOSE("input truncated", { - {"n_ctx", slot.n_ctx}, + {"kv_size", slot.kv_size}, {"n_keep", slot.params.n_keep}, {"n_left", n_left}, {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, @@ -1620,7 +1620,7 @@ struct llama_server_context prompt_tokens = new_tokens; slot.num_prompt_tokens = prompt_tokens.size(); - GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); + GGML_ASSERT(slot.num_prompt_tokens < slot.kv_size); } if (!slot.params.cache_prompt) @@ -1873,7 +1873,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params, printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? 
"enabled" : "disabled"); printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n"); - printf(" -kv N, --kv-size N Specify the total size of the KV cache (default: %d)\n", params.n_ctx); + printf(" -kv N, --kv-size N Specify the total size of the KV cache (default: %d)\n", params.kv_size); printf(" --rope-scaling {none,linear,yarn}\n"); printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); @@ -2043,16 +2043,16 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, server_print_usage(argv[0], default_params, default_sparams); exit(0); } - else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") + else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") { if (++i >= argc) { invalid_param = true; break; } - params.n_ctx = std::stoi(argv[i]); - LOG_WARNING("-c,--ctx-size,--ctx_size option is deprecated, use --kv-size instead", - {{"--ctx_size", params.n_ctx}}); + params.kv_size = std::stoi(argv[i]); + LOG_WARNING("-c,--ctx-size,--ctx_size option is deprecated, use --kv-size instead", + {{"--kv_size", params.kv_size}}); } else if (arg == "-kv" || arg == "--kv-size" || arg == "--kv_size") { @@ -2061,7 +2061,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, invalid_param = true; break; } - params.n_ctx = std::stoi(argv[i]); + params.kv_size = std::stoi(argv[i]); } else if (arg == "--rope-scaling") { diff --git a/examples/simple/README.md b/examples/simple/README.md index 5d24b1046..d845b25be 100644 --- a/examples/simple/README.md +++ b/examples/simple/README.md @@ -7,7 +7,7 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for g ... 
-main: n_len = 32, n_ctx = 2048, n_parallel = 1, n_kv_req = 32 +main: n_len = 32, kv_size = 2048, n_parallel = 1, n_kv_req = 32 Hello my name is Shawn and I'm a 20 year old male from the United States. I'm a 20 year old diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 39e2d8ea4..af4d4e1a5 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -52,7 +52,7 @@ int main(int argc, char ** argv) { llama_context_params ctx_params = llama_context_default_params(); ctx_params.seed = 1234; - ctx_params.n_ctx = 2048; + ctx_params.kv_size = 2048; ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; @@ -68,15 +68,15 @@ int main(int argc, char ** argv) { std::vector tokens_list; tokens_list = ::llama_tokenize(ctx, params.prompt, true); - const int n_ctx = llama_n_ctx(ctx); + const int kv_size = llama_kv_size(ctx); const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req); + LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_kv_req = %d\n", __func__, n_len, kv_size, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens - if (n_kv_req > n_ctx) { - LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); - LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__); + if (n_kv_req > kv_size) { + LOG_TEE("%s: error: n_kv_req > kv_size, the required KV cache size is not big enough\n", __func__); + LOG_TEE("%s: either reduce n_len or increase kv_size\n", __func__); return 1; } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 3848791d4..01e43d5a4 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -116,7 +116,7 @@ int main(int argc, char ** argv) { std::vector inp; inp = 
::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true); - const int max_context_size = llama_n_ctx(ctx_tgt); + const int max_context_size = llama_kv_size(ctx_tgt); const int max_tokens_list_size = max_context_size - 4; if ((int) inp.size() > max_tokens_list_size) { @@ -172,8 +172,8 @@ int main(int argc, char ** argv) { drafts[s].ctx_sampling = llama_sampling_init(params.sparams); } - llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); - llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft); + llama_batch batch_dft = llama_batch_init(params.kv_size, 0, 1); + llama_batch batch_tgt = llama_batch_init(params.kv_size, 0, n_seq_dft); const auto t_dec_start = ggml_time_us(); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index e78ab185d..ffa56a2e7 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -22,7 +22,7 @@ struct my_llama_hparams { uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; + uint32_t kv_size = 512; uint32_t n_embd = 4096; uint32_t n_head = 32; uint32_t n_layer = 32; @@ -112,7 +112,7 @@ static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; static void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab: %u\n", __func__, params->n_vocab); - printf("%s: n_ctx: %u\n", __func__, params->n_ctx); + printf("%s: kv_size: %u\n", __func__, params->kv_size); printf("%s: n_embd: %u\n", __func__, params->n_embd); printf("%s: n_head: %u\n", __func__, params->n_head); printf("%s: n_ff: %u\n", __func__, params->n_ff); @@ -272,7 +272,7 @@ static struct ggml_tensor * llama_build_train_graphs( const int n_past = 0; const int N = n_tokens; const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; + const int kv_size = hparams.kv_size; const int n_vocab = hparams.n_vocab; const int n_embd = hparams.n_embd; const int n_layer = 
hparams.n_layer; @@ -295,13 +295,13 @@ static struct ggml_tensor * llama_build_train_graphs( ggml_set_input(KQ_pos); // rope has so much parameters that we make a custom function for it - auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] + auto rope = [ctx, KQ_pos, n_rot, kv_size, rope_freq_base, rope_freq_scale] (struct ggml_tensor * t) -> struct ggml_tensor * { // not capturing these, to silcence warnings const int rope_mode = 0; return ggml_rope_custom( - ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f + ctx, t, KQ_pos, n_rot, rope_mode, kv_size, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f ); }; @@ -487,8 +487,8 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE); GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32); - // n_ctx was not saved in earlier checkpoint file versions, so we make it optional here - GGUF_GET_KEY(fctx, model->hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH)); + // kv_size was not saved in earlier checkpoint file versions, so we make it optional here + GGUF_GET_KEY(fctx, model->hparams.kv_size, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH)); GGUF_GET_KEY(fctx, model->hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); GGUF_GET_KEY(fctx, model->hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); @@ -543,7 +543,7 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype); // set hparams - gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx ); + gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.kv_size ); gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd 
); gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff ); gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head ); @@ -945,7 +945,7 @@ int main(int argc, char ** argv) { struct my_llama_model model; model.hparams.n_vocab = llama_n_vocab(lmodel); - model.hparams.n_ctx = params.common.n_ctx; + model.hparams.kv_size = params.common.n_ctx; model.hparams.n_embd = params.n_embd; model.hparams.n_head = params.n_head; model.hparams.n_layer = params.n_layer; @@ -982,9 +982,9 @@ int main(int argc, char ** argv) { printf("%s: init model\n", __func__); bool existed = load_checkpoint_file(params.common.fn_checkpoint_in, &model, train); if (existed) { - // overwrite last n_ctx with user provided n_ctx + // overwrite last kv_size with user provided kv_size if (params.common.custom_n_ctx) { - model.hparams.n_ctx = params.common.n_ctx; + model.hparams.kv_size = params.common.n_ctx; } const bool opt_past_changed = opt->params.past != params.common.opt_past; @@ -1031,7 +1031,7 @@ int main(int argc, char ** argv) { printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); printf("%s: opt iter %d\n", __func__, opt->iter); - int n_tokens = model.hparams.n_ctx; + int n_tokens = model.hparams.kv_size; int n_vocab = model.hparams.n_vocab; int n_batch = params.common.n_batch; diff --git a/llama.cpp b/llama.cpp index 5cfebb3b1..58edaab54 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1607,7 +1607,7 @@ struct llama_hparams { }; struct llama_cparams { - uint32_t n_ctx; // context size used during inference + uint32_t kv_size; // KV Cache size used during inference uint32_t n_batch; uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing @@ -1923,9 +1923,9 @@ struct llama_context { struct ggml_tensor * inp_tokens; // I32 [n_batch] struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] 
struct ggml_tensor * inp_pos; // I32 [n_batch] - struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch] - struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx] - struct ggml_tensor * inp_K_shift; // I32 [n_ctx] + struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] + struct ggml_tensor * inp_KQ_pos; // F32 [kv_size] + struct ggml_tensor * inp_K_shift; // I32 [kv_size] struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] @@ -1943,7 +1943,7 @@ static bool llama_kv_cache_init( const llama_model & model, ggml_type ktype, ggml_type vtype, - uint32_t n_ctx, + uint32_t kv_size, bool offload) { const struct llama_hparams & hparams = model.hparams; @@ -1954,11 +1954,11 @@ static bool llama_kv_cache_init( cache.has_shift = false; cache.head = 0; - cache.size = n_ctx; + cache.size = kv_size; cache.used = 0; cache.cells.clear(); - cache.cells.resize(n_ctx); + cache.cells.resize(kv_size); #ifdef GGML_USE_CLBLAST offload = false; @@ -1997,8 +1997,8 @@ static bool llama_kv_cache_init( for (int i = 0; i < (int) n_layer; i++) { struct ggml_context * ctx = offload ? 
ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); - ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx); - ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx); + ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*kv_size); + ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*kv_size); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); @@ -2029,19 +2029,19 @@ static bool llama_kv_cache_init( static bool llama_kv_cache_find_slot( struct llama_kv_cache & cache, const struct llama_batch & batch) { - const uint32_t n_ctx = cache.size; + const uint32_t kv_size = cache.size; const uint32_t n_tokens = batch.n_tokens; - if (n_tokens > n_ctx) { - LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx); + if (n_tokens > kv_size) { + LLAMA_LOG_ERROR("%s: n_tokens=%d > kv_size=%d\n", __func__, n_tokens, kv_size); return false; } uint32_t n_tested = 0; while (true) { - if (cache.head + n_tokens > n_ctx) { - n_tested += n_ctx - cache.head; + if (cache.head + n_tokens > kv_size) { + n_tested += kv_size - cache.head; cache.head = 0; continue; } @@ -2060,7 +2060,7 @@ static bool llama_kv_cache_find_slot( break; } - if (n_tested >= n_ctx) { + if (n_tested >= kv_size) { //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); return false; } @@ -3692,11 +3692,11 @@ static bool llm_load_tensors( } // create one context per buffer type - size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors; + size_t kv_size = ggml_tensor_overhead() * ml.n_tensors; std::map ctx_map; for (auto & it : buft_layer_count) { struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, + /*.mem_size =*/ kv_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -3708,7 +3708,7 @@ static bool llm_load_tensors( model.ctxs.push_back(ctx); } - LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0); + 
LLAMA_LOG_INFO("%s: ggml KV size = %7.2f MiB\n", __func__, model.ctxs.size()*kv_size/1024.0/1024.0); // create tensors for the weights { @@ -4584,7 +4584,7 @@ static void llm_build_k_shift( struct ggml_cgraph * graph, struct ggml_tensor * K_shift, llm_rope_type type, - int64_t n_ctx, + int64_t kv_size, float freq_base, float freq_scale, const llm_build_cb & cb) { @@ -4612,7 +4612,7 @@ static void llm_build_k_shift( // we rotate only the first n_rot dimensions ggml_rope_custom_inplace(ctx, ggml_view_3d(ctx, kv.k_l[il], - n_embd_head_k, n_head_kv, n_ctx, + n_embd_head_k, n_head_kv, kv_size, ggml_row_size(kv.k_l[il]->type, n_embd_head_k), ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), 0), @@ -4630,7 +4630,7 @@ static void llm_build_kv_store( struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, - int64_t n_ctx, + int64_t kv_size, int32_t n_tokens, int32_t kv_head, const llm_build_cb & cb, @@ -4648,7 +4648,7 @@ static void llm_build_kv_store( cb(k_cache_view, "k_cache_view", il); struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa, - ( n_ctx)*ggml_element_size(kv.v_l[il]), + ( kv_size)*ggml_element_size(kv.v_l[il]), (kv_head)*ggml_element_size(kv.v_l[il])); cb(v_cache_view, "v_cache_view", il); @@ -4792,7 +4792,7 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * q_cur, struct ggml_tensor * kq_mask, struct ggml_tensor * kq_pos, - int64_t n_ctx, + int64_t kv_size, int32_t n_tokens, int32_t n_kv, float kq_scale, @@ -4851,8 +4851,8 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * v = ggml_view_3d(ctx, kv.v_l[il], n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv.v_l[il])*n_ctx, - ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v, + ggml_element_size(kv.v_l[il])*kv_size, + ggml_element_size(kv.v_l[il])*kv_size *n_embd_head_v, 0); cb(v, "v", il); @@ -4892,7 +4892,7 @@ static struct ggml_tensor * llm_build_kv( struct ggml_tensor * q_cur, struct ggml_tensor * kq_mask, 
struct ggml_tensor * kq_pos, - int64_t n_ctx, + int64_t kv_size, int32_t n_tokens, int32_t kv_head, int32_t n_kv, @@ -4906,11 +4906,11 @@ static struct ggml_tensor * llm_build_kv( ggml_build_forward_expand(graph, k_cur); ggml_build_forward_expand(graph, v_cur); - llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il); + llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, kv_size, n_tokens, kv_head, cb, il); struct ggml_tensor * cur; cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b, - q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il); + q_cur, kq_mask, kq_pos, kv_size, n_tokens, n_kv, kq_scale, cb, il); cb(cur, "kqv_out", il); return cur; @@ -4926,7 +4926,7 @@ struct llm_build_context { const int64_t n_embd; const int64_t n_layer; - const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) + const int64_t kv_size; // user-specified KV Cache size (can be different from n_ctx_train) const int64_t n_head; const int64_t n_head_kv; const int64_t n_embd_head_k; @@ -4946,7 +4946,7 @@ struct llm_build_context { const float norm_rms_eps; const int32_t n_tokens; - const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx) + const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_size) const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_orig_ctx; @@ -4973,7 +4973,7 @@ struct llm_build_context { kv_self (lctx.kv_self), n_embd (hparams.n_embd), n_layer (hparams.n_layer), - n_ctx (cparams.n_ctx), + kv_size (cparams.kv_size), n_head (hparams.n_head), n_head_kv (hparams.n_head_kv), n_embd_head_k (hparams.n_embd_head_k), @@ -4991,14 +4991,14 @@ struct llm_build_context { norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (batch.n_tokens), - n_kv (worst_case ? n_ctx : kv_self.n), - kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), + n_kv (worst_case ? kv_size : kv_self.n), + kv_head (worst_case ? 
kv_size - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), do_rope_shift (worst_case || kv_self.has_shift), pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { - // all initializations should be done in init() + // all initializations should be done in init() } void init() { @@ -5041,7 +5041,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5093,7 +5093,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5229,7 +5229,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5277,7 +5277,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5347,7 +5347,7 @@ struct llm_build_context { // shift 
the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5401,7 +5401,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5500,7 +5500,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5565,7 +5565,7 @@ struct llm_build_context { cb(KQ_mask, "KQ_mask", -1); if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5705,7 +5705,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Q, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5798,7 +5798,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, 
kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5899,7 +5899,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } else { // compute Q and K and RoPE them @@ -5930,7 +5930,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6043,7 +6043,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6140,7 +6140,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6209,7 +6209,7 @@ struct llm_build_context { // shift the entire K-cache if needed if 
(do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -6262,7 +6262,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6332,7 +6332,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -6377,7 +6377,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6446,7 +6446,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -6498,7 +6498,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, 
model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6567,7 +6567,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -6625,7 +6625,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f, cb, il); cb(cur, "kqv_out", il); } @@ -6689,7 +6689,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -6728,7 +6728,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } struct ggml_tensor * sa_out = cur; @@ -6827,7 +6827,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, 
Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6894,7 +6894,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -6936,7 +6936,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -7002,7 +7002,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -7054,7 +7054,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -7121,7 +7121,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, 
n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -7172,8 +7172,8 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -7253,7 +7253,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -7304,8 +7304,8 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -7549,13 +7549,13 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } if (kv_self.has_shift) { - const int64_t n_ctx = cparams.n_ctx; + const int64_t kv_size = cparams.kv_size; assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); int32_t * data = (int32_t *) lctx.inp_K_shift->data; - for (int i = 0; i < n_ctx; ++i) { + for (int i = 0; i < kv_size; ++i) { data[i] = lctx.kv_self.cells[i].delta; } } @@ -7694,7 +7694,7 @@ static int 
llama_decode_internal( // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); + kv_self.n = std::min((int32_t) cparams.kv_size, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); //kv_self.n = llama_kv_cache_cell_max(kv_self); //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); @@ -11148,7 +11148,7 @@ struct llama_model_params llama_model_default_params() { struct llama_context_params llama_context_default_params() { struct llama_context_params result = { /*.seed =*/ LLAMA_DEFAULT_SEED, - /*.n_ctx =*/ 512, + /*.kv_size =*/ 512, /*.n_batch =*/ 512, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, @@ -11328,7 +11328,7 @@ struct llama_context * llama_new_context_with_model( cparams.offload_kqv = params.offload_kqv; cparams.do_pooling = params.do_pooling; - cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.kv_size = params.kv_size == 0 ? hparams.n_ctx_train : params.kv_size; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? 
hparams.rope_freq_scale_train : params.rope_freq_scale; @@ -11356,7 +11356,7 @@ struct llama_context * llama_new_context_with_model( params.seed = time(NULL); } - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); + LLAMA_LOG_INFO("%s: kv_size = %u\n", __func__, cparams.kv_size); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); @@ -11447,7 +11447,7 @@ struct llama_context * llama_new_context_with_model( ctx->backends.push_back(ctx->backend_cpu); if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, - cparams.n_ctx, cparams.offload_kqv)) { + cparams.kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -11490,9 +11490,9 @@ struct llama_context * llama_new_context_with_model( ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch); ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); - ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch); - ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx); - ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx); + ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.kv_size, cparams.n_batch); + ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.kv_size); + ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.kv_size); ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch); ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); @@ -11531,8 +11531,8 @@ struct llama_context * 
llama_new_context_with_model( ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES); // build worst-case graph - int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); - int n_past = cparams.n_ctx - n_tokens; + int n_tokens = (int)std::min(cparams.kv_size, cparams.n_batch); + int n_past = cparams.kv_size - n_tokens; llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true); @@ -11565,7 +11565,7 @@ struct llama_context * llama_new_context_with_model( // Enter a blocking eval loop with dummy input, letting rank=0 drive the process // TODO: needs fix after #3228 GGML_ASSERT(false && "not implemented"); - //const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx)); + //const std::vector tmp(ctx->model.hparams.kv_size, llama_token_bos(ctx)); //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; llama_backend_free(); exit(1); @@ -11583,8 +11583,8 @@ const llama_model * llama_get_model(const struct llama_context * ctx) { return &ctx->model; } -uint32_t llama_n_ctx(const struct llama_context * ctx) { - return ctx->cparams.n_ctx; +uint32_t llama_kv_size(const struct llama_context * ctx) { + return ctx->cparams.kv_size; } uint32_t llama_n_batch(const struct llama_context * ctx) { @@ -11982,7 +11982,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const auto n_layer = hparams.n_layer; const auto n_embd_k_gqa = hparams.n_embd_k_gqa(); const auto n_embd_v_gqa = hparams.n_embd_v_gqa(); - const auto n_ctx = cparams.n_ctx; + const auto n_kv_req = cparams.kv_size; const size_t kv_buf_size = kv_self.total_size(); const uint32_t kv_head = kv_self.head; @@ -12006,7 +12006,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat // v is 
not contiguous, copy row by row tmp_buf.resize(elt_size*kv_head); for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { - ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size()); + ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_kv_req, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); } } @@ -12093,7 +12093,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { const int n_layer = hparams.n_layer; const int n_embd_k_gqa = hparams.n_embd_k_gqa(); const int n_embd_v_gqa = hparams.n_embd_v_gqa(); - const int n_ctx = cparams.n_ctx; + const int n_kv_req = cparams.kv_size; size_t kv_buf_size; uint32_t kv_head; @@ -12118,7 +12118,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { // v is not contiguous, copy row by row size_t v_row_size = elt_size*kv_head; for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { - ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size); + ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_kv_req, v_row_size); inp += v_row_size; } } diff --git a/llama.h b/llama.h index 5a97abcc9..b9f49ab31 100644 --- a/llama.h +++ b/llama.h @@ -217,7 +217,7 @@ extern "C" { struct llama_context_params { uint32_t seed; // RNG seed, -1 for random - uint32_t n_ctx; // text context, 0 = from model + uint32_t kv_size; // KV Cache size, 0 = from model uint32_t n_batch; // prompt processing maximum batch size uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing @@ -347,7 +347,7 @@ extern "C" { LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); - LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); + LLAMA_API uint32_t llama_kv_size (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct 
llama_model * model); diff --git a/scripts/run-with-preset.py b/scripts/run-with-preset.py index a18252730..ba6a3b36a 100755 --- a/scripts/run-with-preset.py +++ b/scripts/run-with-preset.py @@ -8,7 +8,7 @@ import sys import yaml CLI_ARGS_MAIN_PERPLEXITY = [ - "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape", + "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "kv-size", "escape", "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag", "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct", "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base", @@ -27,7 +27,7 @@ CLI_ARGS_LLAMA_BENCH = [ ] CLI_ARGS_SERVER = [ - "alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base", + "alias", "batch-size", "kv-size", "embedding", "host", "memory-f32", "lora", "lora-base", "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q", "numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split", "threads", "verbose" diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index ef37c5af2..a7d607f3d 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1121,21 +1121,21 @@ struct test_rope : public test_case { const std::array ne; int n_dims; int mode; - int n_ctx; + int kv_size; std::string vars() override { - return VARS_TO_STR5(type, ne, n_dims, mode, n_ctx); + return VARS_TO_STR5(type, ne, n_dims, mode, kv_size); } test_rope(ggml_type type = GGML_TYPE_F32, std::array ne = {10, 10, 10, 1}, int n_dims = 10, int mode = 0, int n_ctx = 512) - : type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx) {} + : type(type), ne(ne), n_dims(n_dims), mode(mode), kv_size(n_ctx) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); ggml_tensor * pos = 
ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]); - ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, n_ctx); + ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, kv_size); return out; } @@ -1145,7 +1145,7 @@ struct test_rope : public test_case { // pos std::vector data(ne[2]); for (int i = 0; i < ne[2]; i++) { - data[i] = rand() % n_ctx; + data[i] = rand() % kv_size; } ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int)); } else { @@ -1545,7 +1545,7 @@ struct llama_hparams { int32_t n_tokens; // llm_build_context - static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx + static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= kv_size) static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads