mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-04 01:57:53 +01:00
llama : fix n_kv to never become 0
This commit is contained in:
parent
7b7472ee26
commit
e1067efbfa
16
llama.cpp
16
llama.cpp
@ -1025,7 +1025,7 @@ struct llama_kv_cache {
|
|||||||
uint32_t size = 0;
|
uint32_t size = 0;
|
||||||
|
|
||||||
// computed before each graph build
|
// computed before each graph build
|
||||||
uint32_t cell_max = 0;
|
uint32_t n = 0;
|
||||||
|
|
||||||
std::vector<llama_kv_cell> cells;
|
std::vector<llama_kv_cell> cells;
|
||||||
|
|
||||||
@ -2619,7 +2619,7 @@ static struct ggml_cgraph * llm_build_llama(
|
|||||||
const int n_gpu_layers = model.n_gpu_layers;
|
const int n_gpu_layers = model.n_gpu_layers;
|
||||||
|
|
||||||
const int32_t n_tokens = batch.n_tokens;
|
const int32_t n_tokens = batch.n_tokens;
|
||||||
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max;
|
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
||||||
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
||||||
|
|
||||||
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
||||||
@ -3007,7 +3007,7 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|||||||
const int n_gpu_layers = model.n_gpu_layers;
|
const int n_gpu_layers = model.n_gpu_layers;
|
||||||
|
|
||||||
const int32_t n_tokens = batch.n_tokens;
|
const int32_t n_tokens = batch.n_tokens;
|
||||||
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max;
|
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
||||||
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
||||||
|
|
||||||
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
||||||
@ -3410,7 +3410,7 @@ static struct ggml_cgraph * llm_build_falcon(
|
|||||||
const int n_gpu_layers = model.n_gpu_layers;
|
const int n_gpu_layers = model.n_gpu_layers;
|
||||||
|
|
||||||
const int32_t n_tokens = batch.n_tokens;
|
const int32_t n_tokens = batch.n_tokens;
|
||||||
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max;
|
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
||||||
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
||||||
|
|
||||||
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
||||||
@ -3771,7 +3771,7 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|||||||
const float norm_eps = hparams.f_norm_eps;
|
const float norm_eps = hparams.f_norm_eps;
|
||||||
|
|
||||||
const int32_t n_tokens = batch.n_tokens;
|
const int32_t n_tokens = batch.n_tokens;
|
||||||
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max;
|
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
||||||
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
||||||
|
|
||||||
auto & buf_compute = lctx.buf_compute;
|
auto & buf_compute = lctx.buf_compute;
|
||||||
@ -4102,8 +4102,10 @@ static int llama_decode_internal(
|
|||||||
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
||||||
// after enough generations, the benefit from this heuristic disappears
|
// after enough generations, the benefit from this heuristic disappears
|
||||||
// if we start defragmenting the cache, the benefit from this will be more important
|
// if we start defragmenting the cache, the benefit from this will be more important
|
||||||
kv_self.cell_max = llama_kv_cache_cell_max(kv_self);
|
//kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
|
||||||
//printf("kv_self.cell_max = %d\n", kv_self.cell_max);
|
kv_self.n = std::max(32, llama_kv_cache_cell_max(kv_self));
|
||||||
|
|
||||||
|
//printf("kv_self.n = %d\n", kv_self.n);
|
||||||
|
|
||||||
ggml_allocr_reset(lctx.alloc);
|
ggml_allocr_reset(lctx.alloc);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user