examples : rely on new behavior of add_special

Jared Van Bortel 2024-04-04 18:12:33 -04:00
parent d1a1b614cd
commit 92591c125f
14 changed files with 53 additions and 69 deletions

View File

@@ -2141,23 +2141,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 std::vector<llama_token> llama_tokenize(
   const struct llama_context * ctx,
            const std::string & text,
-                        bool   add_bos,
-                        bool   special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
+                        bool   add_special,
+                        bool   parse_special) {
+    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
 }
 
 std::vector<llama_token> llama_tokenize(
     const struct llama_model * model,
            const std::string & text,
-                        bool   add_bos,
-                        bool   special) {
+                        bool   add_special,
+                        bool   parse_special) {
     // upper limit for the number of tokens
-    int n_tokens = text.length() + add_bos;
+    int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
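For context (not part of the diff): a minimal sketch of how the example programs now call the updated wrapper, assuming an already-initialized ctx and the usual gpt_params:

    // hypothetical call site mirroring the ones changed in this commit
    std::vector<llama_token> toks = ::llama_tokenize(ctx, params.prompt,
            /*add_special*/   true,   // BOS/EOS are added according to the model's metadata
            /*parse_special*/ true);  // control tokens such as "<|im_start|>" are parsed from the text
    LOG("prompt tokenized into %zu tokens\n", toks.size());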

View File

@@ -221,14 +221,14 @@ void llama_batch_add(
 std::vector<llama_token> llama_tokenize(
   const struct llama_context * ctx,
            const std::string & text,
-                        bool   add_bos,
-                        bool   special = false);
+                        bool   add_special,
+                        bool   parse_special = false);
 
 std::vector<llama_token> llama_tokenize(
     const struct llama_model * model,
            const std::string & text,
-                        bool   add_bos,
-                        bool   special = false);
+                        bool   add_special,
+                        bool   parse_special = false);
 
 // tokenizes a token into a piece
 // should work similar to Python's `tokenizer.id_to_piece`
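For reference, a sketch of the retry pattern the wrapper in common.cpp encapsulates over the raw C API (assuming llama.h is included, ctx is initialized, and the input string is a placeholder):

    std::string text = "example input";                 // placeholder input
    std::vector<llama_token> result(text.length() + 2); // rough upper bound; +2 leaves room for BOS/EOS
    int n = llama_tokenize(llama_get_model(ctx), text.data(), text.length(),
                           result.data(), result.size(), /*add_special*/ true, /*parse_special*/ false);
    if (n < 0) {
        // a negative return value is the required token count; grow the buffer and retry
        result.resize(-n);
        n = llama_tokenize(llama_get_model(ctx), text.data(), text.length(),
                           result.data(), result.size(), true, false);
    }
    result.resize(n);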

View File

@@ -349,12 +349,13 @@ static void process_logits(
 static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
     const int n_ctx = llama_n_ctx(ctx);
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
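For context: the assert added here (and in several other examples below) guards against models whose metadata requests an automatic EOS after tokenization, which these tools do not account for. A sketch of the check; the return-value convention (1 = true, 0 = false, -1 = unknown) is stated as an assumption:

    const llama_model * mdl = llama_get_model(ctx);
    const bool add_bos = llama_should_add_bos_token(mdl);  // still queried for chunk layout / logging
    GGML_ASSERT(llama_add_eos_token(mdl) != 1);            // refuse models that want an automatic EOS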

View File

@@ -239,6 +239,7 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s\n", get_system_info(params).c_str());
     }
     const bool add_bos = llama_should_add_bos_token(model);
+    GGML_ASSERT(llama_add_eos_token(model) != 1);
     LOG("add_bos: %d\n", add_bos);
     bool suff_rm_leading_spc = params.escape;
@@ -279,10 +280,10 @@ int main(int argc, char ** argv) {
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
         original_prompt_len = original_inp.size();

View File

@@ -146,7 +146,6 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     int n_past = 0;
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
     std::string system_prompt, user_prompt;
     size_t image_pos = prompt.find("<image>");
@@ -180,7 +179,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         }
     }
-    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
     llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
     eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);

View File

@@ -64,13 +64,10 @@ int main(int argc, char ** argv) {
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
     // Tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
     std::vector<llama_token> inp;
     std::vector<llama_token> all;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);
     all = inp;
     const int max_context_size = llama_n_ctx(ctx);

View File

@@ -28,10 +28,8 @@ int main(int argc, char ** argv){
     GGML_ASSERT(model != nullptr);
     // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);
     fprintf(stderr, "%s: tokenization done\n", __func__);

View File

@@ -34,11 +34,8 @@ int main(int argc, char ** argv){
     GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
     // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);
     llama_ngram_cache ngram_cache_context;
     llama_ngram_cache ngram_cache_dynamic;

View File

@@ -42,11 +42,8 @@ int main(int argc, char ** argv){
     GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
     // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);
     llama_ngram_cache ngram_cache_context;
     llama_ngram_cache ngram_cache_dynamic;

View File

@@ -246,6 +246,7 @@ int main(int argc, char ** argv) {
     }
     const bool add_bos = llama_should_add_bos_token(model);
+    GGML_ASSERT(llama_add_eos_token(model) != 1);
     LOG("add_bos: %d\n", add_bos);
     std::vector<llama_token> embd_inp;
@@ -255,7 +256,7 @@ int main(int argc, char ** argv) {
         if (params.chatml) {
             params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
         }
-        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
     } else {
         LOG("use session tokens\n");
         embd_inp = session_tokens;
@@ -277,10 +278,10 @@ int main(int argc, char ** argv) {
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
         original_prompt_len = original_inp.size();
@@ -339,14 +340,14 @@ int main(int argc, char ** argv) {
     }
     // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
     const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
     LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
     LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
     // chatml prefix & suffix
-    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
+    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
     const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
     LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
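For context: the second boolean (parse_special) controls whether control tokens written in the prompt text are recognized. A sketch of the difference, assuming the model's vocabulary defines "<|im_start|>" as a special token (typical for ChatML models):

    // with parse_special = true, "<|im_start|>" is emitted as a single control token id
    std::vector<llama_token> with_special    = ::llama_tokenize(ctx, "<|im_start|>user\n", false, true);
    // with parse_special = false, the same text is split into ordinary text tokens
    std::vector<llama_token> without_special = ::llama_tokenize(ctx, "<|im_start|>user\n", false, false);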

View File

@@ -315,10 +315,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     // BOS tokens will be added for each chunk before eval
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
     const int n_ctx = llama_n_ctx(ctx);
@@ -454,6 +455,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     // BOS tokens will be added for each chunk before eval
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
     std::ofstream logits_stream;
     if (!params.logits_file.empty()) {
@@ -470,7 +472,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -771,9 +773,6 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
     fprintf(stderr, "================================= is_spm = %d\n", is_spm);
-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
     // The tasks should be randomized so the score stabilizes quickly.
     bool randomize_tasks = true;
@@ -818,7 +817,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
         for (size_t j = 0; j < 4; j++) {
             hs_cur.ending[j] = prompt_lines[idx*6+2+j];
-            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], add_bos);
+            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
         }
         // determine the common prefix of the endings
@@ -837,7 +836,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
             hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
-        //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());
+        //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, true).size());
         // Delete the selected random example from the prompt
         if (randomize_tasks) {
@@ -1110,12 +1109,9 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
     fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
     for (auto & task : data) {
-        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, add_bos);
-        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, add_bos);
+        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
+        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
         task.common_prefix = 0;
         for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
@@ -1130,8 +1126,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             task.seq_tokens[0].size() - task.common_prefix +
             task.seq_tokens[1].size() - task.common_prefix;
-        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos).size();
-        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos).size();
+        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
+        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
     }
     fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
@@ -1322,7 +1318,7 @@ struct multiple_choice_task {
     std::vector<float> log_probs;
 };
-static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
+static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
     if (task.question.empty() || task.mc1.answers.empty()) {
         if (log_error) {
             printf("%s: found bad task with empty question and/or answers\n", __func__);
@@ -1337,7 +1333,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos,
             }
             return false;
         }
-        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
+        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
     }
     auto min_len = task.seq_tokens.front().size();
     for (auto& seq : task.seq_tokens) {
@@ -1436,9 +1432,6 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         n_task = params.multiple_choice_tasks;
     }
-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
     printf("%s: preparing task data", __func__);
     fflush(stdout);
     if (n_task > 500) {
@@ -1446,7 +1439,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         fflush(stdout);
         std::atomic<int> counter(0);
         std::atomic<int> n_bad(0);
-        auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
+        auto prepare = [&counter, &n_bad, &tasks, ctx] () {
            int num_tasks = tasks.size();
            int n_bad_local = 0;
            while (true) {
@@ -1457,7 +1450,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                }
                int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
                for (int i = first; i < last; ++i) {
-                   if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
+                   if (!multiple_choice_prepare_one_task(ctx, tasks[i], false)) ++n_bad_local;
                }
            }
        };
@@ -1479,7 +1472,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        int i_task = 0;
        for (auto& task : tasks) {
            ++i_task;
-           if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
+           if (!multiple_choice_prepare_one_task(ctx, task, true)) {
                return;
            }
            if (i_task%n_dot == 0) {
@@ -1715,6 +1708,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     const int num_batches = (n_ctx + n_batch - 1)/n_batch;
     const int nv = 2*((n_vocab + 1)/2) + 4;
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
     std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
     std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
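For context: the hellaswag/winogrande scorers only re-evaluate the suffix of each candidate that differs, so they first compute the longest common token prefix across the tokenized sequences. A sketch of that computation for two sequences (variable names illustrative, not the literal upstream loop):

    size_t common_prefix = 0;
    const std::vector<llama_token> & s0 = seq_tokens[0];
    const std::vector<llama_token> & s1 = seq_tokens[1];
    // advance while both sequences agree token-for-token
    while (common_prefix < s0.size() && common_prefix < s1.size() &&
           s0[common_prefix] == s1[common_prefix]) {
        ++common_prefix;
    }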

View File

@@ -685,6 +685,7 @@ struct server_context {
         n_ctx = llama_n_ctx(ctx);
         add_bos_token = llama_should_add_bos_token(model);
+        GGML_ASSERT(llama_add_eos_token(model) != 1);
         return true;
     }
@@ -754,7 +755,7 @@ struct server_context {
         metrics.init();
     }
-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const {
+    std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
         // TODO: currently, we tokenize using special tokens by default
         // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
         // but it's better compared to completely ignoring ChatML and other chat templates
@@ -772,7 +773,7 @@ struct server_context {
                 std::vector<llama_token> p;
                 if (first) {
-                    p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+                    p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
                     first = false;
                 } else {
                     p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
@@ -789,7 +790,7 @@ struct server_context {
             }
         } else {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
         }
         return prompt_tokens;
@@ -1054,7 +1055,7 @@ struct server_context {
         system_tokens.clear();
         if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
+            system_tokens = ::llama_tokenize(ctx, system_prompt, true);
             llama_batch_clear(batch);
@@ -1809,7 +1810,7 @@ struct server_context {
                 prefix_tokens.push_back(llama_token_middle(model));
                 prompt_tokens = prefix_tokens;
             } else {
-                prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
+                prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
             }
             slot.n_past = 0;
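For context: the server's tokenize() helper accepts either a plain string or a JSON array mixing text pieces with raw token ids. A sketch of the two shapes as they might be used from inside server_context (the JSON values here are illustrative, not from the commit):

    json p1 = "Hello, world";                           // plain string -> tokenized with add_special
    json p2 = json::array({ "Hello", 42, ", world" });  // mixed text pieces and raw token ids
    std::vector<llama_token> t1 = tokenize(p1, /*add_special*/ true);
    std::vector<llama_token> t2 = tokenize(p2, /*add_special*/ true);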

View File

@@ -118,7 +118,7 @@ int main(int argc, char ** argv) {
     }
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
+    inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);
     const int max_context_size = llama_n_ctx(ctx_tgt);
     const int max_tokens_list_size = max_context_size - 4;

View File

@@ -26,11 +26,9 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_default_params();
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-    const bool add_bos = llama_should_add_bos_token(model);
     std::vector<llama_token> tokens;
-    tokens = ::llama_tokenize(model, prompt, add_bos, true);
+    tokens = ::llama_tokenize(model, prompt, true, true);
     for (int i = 0; i < (int) tokens.size(); i++) {
         if (printing_ids) {
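For context, a sketch of a token-dump loop built on the llama_token_to_piece helper declared in common.h above (the formatting is illustrative, not the literal output of the tokenize example):

    for (int i = 0; i < (int) tokens.size(); i++) {
        if (printing_ids) {
            printf("%d\n", tokens[i]);                    // ids only
        } else {
            printf("%6d -> '%s'\n", tokens[i],            // id plus the decoded piece
                   llama_token_to_piece(ctx, tokens[i]).c_str());
        }
    }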