diff --git a/common/common.cpp b/common/common.cpp
index e92dee7a7..c5c4d7508 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2690,10 +2690,15 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         llama_token bos = llama_token_bos(model);
         llama_token eos = llama_token_eos(model);
         // some models (e.g. T5) don't have a BOS token
-        if (bos != -1) {
+        if (bos != LLAMA_TOKEN_NULL) {
             tmp.push_back(bos);
         }
-        tmp.push_back(eos);
+        if (eos != LLAMA_TOKEN_NULL) {
+            tmp.push_back(eos);
+        }
+        if (tmp.empty()) {
+            tmp.push_back(0);
+        }
 
         if (llama_model_has_encoder(model)) {
             llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature
index 6f163ce04..e1eade6cd 100644
--- a/examples/server/tests/features/embeddings.feature
+++ b/examples/server/tests/features/embeddings.feature
@@ -9,8 +9,11 @@ Feature: llama.cpp server
     And   a model alias bert-bge-small
     And   42 as server seed
     And   2 slots
-    And   1024 as batch size
-    And   1024 as ubatch size
+    # the bert-bge-small model has context size of 512
+    # since the generated prompts are as big as the batch size, we need to set the batch size to 512
+    # ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
+    And   512 as batch size
+    And   512 as ubatch size
     And   2048 KV cache size
     And   embeddings extraction
     Then  the server is starting
diff --git a/src/llama.cpp b/src/llama.cpp
index 6bbaf9fc9..190564fa4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16066,6 +16066,13 @@ static int llama_decode_internal(
         return -1;
     }
 
+    for (uint32_t i = 0; i < n_tokens_all; ++i) {
+        if (batch_all.token[i] < 0) {
+            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
+            return -1;
+        }
+    }
+
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
@@ -16358,6 +16365,13 @@ static int llama_encode_internal(
         return -1;
     }
 
+    for (uint32_t i = 0; i < n_tokens; ++i) {
+        if (batch.token[i] < 0) {
+            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
+            return -1;
+        }
+    }
+
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
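
Note: the two guards in this patch reduce to (1) only pushing BOS/EOS into the warmup prompt when the vocab actually defines them (LLAMA_TOKEN_NULL is -1, meaning "no such token"), with token 0 as a fallback so the batch is never empty, and (2) rejecting any negative token id before decode/encode processes the batch. The standalone sketch below mirrors that logic outside of llama.cpp; the helper names and the plain int32_t/std::vector types are illustrative only and not part of the patch.

#include <cstdint>
#include <cstdio>
#include <vector>

// Mirrors the sentinel llama.h uses for "no such token".
constexpr int32_t TOKEN_NULL = -1;

// Build a warmup prompt the way common.cpp does after this patch: only push
// tokens the vocab actually has, and fall back to token 0 if neither exists.
static std::vector<int32_t> make_warmup_tokens(int32_t bos, int32_t eos) {
    std::vector<int32_t> tmp;
    if (bos != TOKEN_NULL) { tmp.push_back(bos); }
    if (eos != TOKEN_NULL) { tmp.push_back(eos); }
    if (tmp.empty())       { tmp.push_back(0);   }
    return tmp;
}

// Same shape as the new checks in llama_decode_internal/llama_encode_internal:
// any negative token id is reported and the batch is rejected.
static bool tokens_are_valid(const std::vector<int32_t> & tokens) {
    for (size_t i = 0; i < tokens.size(); ++i) {
        if (tokens[i] < 0) {
            fprintf(stderr, "invalid token[%zu] = %d\n", i, tokens[i]);
            return false;
        }
    }
    return true;
}

int main() {
    // T5-style model: no BOS token, valid EOS token.
    const auto tmp = make_warmup_tokens(TOKEN_NULL, 1);
    printf("warmup tokens: %zu, valid: %d\n", tmp.size(), (int) tokens_are_valid(tmp));
    return 0;
}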