mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-14 14:28:58 +01:00
examples : rely on new behavior of add_special
This commit is contained in:
parent
d1a1b614cd
commit
92591c125f
@ -2141,23 +2141,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
|||||||
std::vector<llama_token> llama_tokenize(
|
std::vector<llama_token> llama_tokenize(
|
||||||
const struct llama_context * ctx,
|
const struct llama_context * ctx,
|
||||||
const std::string & text,
|
const std::string & text,
|
||||||
bool add_bos,
|
bool add_special,
|
||||||
bool special) {
|
bool parse_special) {
|
||||||
return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
|
return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<llama_token> llama_tokenize(
|
std::vector<llama_token> llama_tokenize(
|
||||||
const struct llama_model * model,
|
const struct llama_model * model,
|
||||||
const std::string & text,
|
const std::string & text,
|
||||||
bool add_bos,
|
bool add_special,
|
||||||
bool special) {
|
bool parse_special) {
|
||||||
// upper limit for the number of tokens
|
// upper limit for the number of tokens
|
||||||
int n_tokens = text.length() + add_bos;
|
int n_tokens = text.length() + 2 * add_special;
|
||||||
std::vector<llama_token> result(n_tokens);
|
std::vector<llama_token> result(n_tokens);
|
||||||
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
|
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||||
if (n_tokens < 0) {
|
if (n_tokens < 0) {
|
||||||
result.resize(-n_tokens);
|
result.resize(-n_tokens);
|
||||||
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
|
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||||
GGML_ASSERT(check == -n_tokens);
|
GGML_ASSERT(check == -n_tokens);
|
||||||
} else {
|
} else {
|
||||||
result.resize(n_tokens);
|
result.resize(n_tokens);
|
||||||
|
@ -221,14 +221,14 @@ void llama_batch_add(
|
|||||||
std::vector<llama_token> llama_tokenize(
|
std::vector<llama_token> llama_tokenize(
|
||||||
const struct llama_context * ctx,
|
const struct llama_context * ctx,
|
||||||
const std::string & text,
|
const std::string & text,
|
||||||
bool add_bos,
|
bool add_special,
|
||||||
bool special = false);
|
bool parse_special = false);
|
||||||
|
|
||||||
std::vector<llama_token> llama_tokenize(
|
std::vector<llama_token> llama_tokenize(
|
||||||
const struct llama_model * model,
|
const struct llama_model * model,
|
||||||
const std::string & text,
|
const std::string & text,
|
||||||
bool add_bos,
|
bool add_special,
|
||||||
bool special = false);
|
bool parse_special = false);
|
||||||
|
|
||||||
// tokenizes a token into a piece
|
// tokenizes a token into a piece
|
||||||
// should work similar to Python's `tokenizer.id_to_piece`
|
// should work similar to Python's `tokenizer.id_to_piece`
|
||||||
|
@ -349,12 +349,13 @@ static void process_logits(
|
|||||||
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
|
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
|
||||||
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||||
|
@ -239,6 +239,7 @@ int main(int argc, char ** argv) {
|
|||||||
LOG_TEE("%s\n", get_system_info(params).c_str());
|
LOG_TEE("%s\n", get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
const bool add_bos = llama_should_add_bos_token(model);
|
||||||
|
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||||
LOG("add_bos: %d\n", add_bos);
|
LOG("add_bos: %d\n", add_bos);
|
||||||
|
|
||||||
bool suff_rm_leading_spc = params.escape;
|
bool suff_rm_leading_spc = params.escape;
|
||||||
@ -279,10 +280,10 @@ int main(int argc, char ** argv) {
|
|||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||||
|
|
||||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
|
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
|
||||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
||||||
|
|
||||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
||||||
|
|
||||||
original_prompt_len = original_inp.size();
|
original_prompt_len = original_inp.size();
|
||||||
|
@ -146,7 +146,6 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
int n_past = 0;
|
int n_past = 0;
|
||||||
|
|
||||||
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
|
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
|
|
||||||
|
|
||||||
std::string system_prompt, user_prompt;
|
std::string system_prompt, user_prompt;
|
||||||
size_t image_pos = prompt.find("<image>");
|
size_t image_pos = prompt.find("<image>");
|
||||||
@ -180,7 +179,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
|
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
|
||||||
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
|
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
|
||||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||||
|
|
||||||
|
@ -64,13 +64,10 @@ int main(int argc, char ** argv) {
|
|||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
|
|
||||||
// Tokenize the prompt
|
// Tokenize the prompt
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
|
||||||
LOG("add_bos tgt: %d\n", add_bos);
|
|
||||||
|
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
std::vector<llama_token> all;
|
std::vector<llama_token> all;
|
||||||
|
|
||||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
all = inp;
|
all = inp;
|
||||||
|
|
||||||
const int max_context_size = llama_n_ctx(ctx);
|
const int max_context_size = llama_n_ctx(ctx);
|
||||||
|
@ -28,10 +28,8 @@ int main(int argc, char ** argv){
|
|||||||
GGML_ASSERT(model != nullptr);
|
GGML_ASSERT(model != nullptr);
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
|
||||||
|
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
fprintf(stderr, "%s: tokenization done\n", __func__);
|
fprintf(stderr, "%s: tokenization done\n", __func__);
|
||||||
|
|
||||||
|
|
||||||
|
@ -34,11 +34,8 @@ int main(int argc, char ** argv){
|
|||||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
|
||||||
LOG("add_bos tgt: %d\n", add_bos);
|
|
||||||
|
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
|
|
||||||
llama_ngram_cache ngram_cache_context;
|
llama_ngram_cache ngram_cache_context;
|
||||||
llama_ngram_cache ngram_cache_dynamic;
|
llama_ngram_cache ngram_cache_dynamic;
|
||||||
|
@ -42,11 +42,8 @@ int main(int argc, char ** argv){
|
|||||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
|
||||||
LOG("add_bos tgt: %d\n", add_bos);
|
|
||||||
|
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
|
|
||||||
llama_ngram_cache ngram_cache_context;
|
llama_ngram_cache ngram_cache_context;
|
||||||
llama_ngram_cache ngram_cache_dynamic;
|
llama_ngram_cache ngram_cache_dynamic;
|
||||||
|
@ -246,6 +246,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
const bool add_bos = llama_should_add_bos_token(model);
|
||||||
|
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||||
LOG("add_bos: %d\n", add_bos);
|
LOG("add_bos: %d\n", add_bos);
|
||||||
|
|
||||||
std::vector<llama_token> embd_inp;
|
std::vector<llama_token> embd_inp;
|
||||||
@ -255,7 +256,7 @@ int main(int argc, char ** argv) {
|
|||||||
if (params.chatml) {
|
if (params.chatml) {
|
||||||
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
|
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
|
||||||
}
|
}
|
||||||
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
} else {
|
} else {
|
||||||
LOG("use session tokens\n");
|
LOG("use session tokens\n");
|
||||||
embd_inp = session_tokens;
|
embd_inp = session_tokens;
|
||||||
@ -277,10 +278,10 @@ int main(int argc, char ** argv) {
|
|||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||||
|
|
||||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
|
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
|
||||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
||||||
|
|
||||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
||||||
|
|
||||||
original_prompt_len = original_inp.size();
|
original_prompt_len = original_inp.size();
|
||||||
@ -339,14 +340,14 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// prefix & suffix for instruct mode
|
// prefix & suffix for instruct mode
|
||||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
|
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
|
||||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
|
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
|
||||||
|
|
||||||
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
||||||
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
||||||
|
|
||||||
// chatml prefix & suffix
|
// chatml prefix & suffix
|
||||||
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
|
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
|
||||||
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
|
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
|
||||||
|
|
||||||
LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
|
LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
|
||||||
|
@ -315,10 +315,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
|||||||
// BOS tokens will be added for each chunk before eval
|
// BOS tokens will be added for each chunk before eval
|
||||||
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||||
|
|
||||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
@ -454,6 +455,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
|||||||
// BOS tokens will be added for each chunk before eval
|
// BOS tokens will be added for each chunk before eval
|
||||||
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||||
|
|
||||||
std::ofstream logits_stream;
|
std::ofstream logits_stream;
|
||||||
if (!params.logits_file.empty()) {
|
if (!params.logits_file.empty()) {
|
||||||
@ -470,7 +472,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
|||||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||||
@ -771,9 +773,6 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
|
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
|
||||||
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
|
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
|
||||||
|
|
||||||
// This is needed as usual for LLaMA models
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
|
||||||
|
|
||||||
// The tasks should be randomized so the score stabilizes quickly.
|
// The tasks should be randomized so the score stabilizes quickly.
|
||||||
bool randomize_tasks = true;
|
bool randomize_tasks = true;
|
||||||
|
|
||||||
@ -818,7 +817,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
|
hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
|
||||||
for (size_t j = 0; j < 4; j++) {
|
for (size_t j = 0; j < 4; j++) {
|
||||||
hs_cur.ending[j] = prompt_lines[idx*6+2+j];
|
hs_cur.ending[j] = prompt_lines[idx*6+2+j];
|
||||||
hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], add_bos);
|
hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// determine the common prefix of the endings
|
// determine the common prefix of the endings
|
||||||
@ -837,7 +836,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
|
hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
|
||||||
hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
|
hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
|
||||||
|
|
||||||
//GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());
|
//GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, true).size());
|
||||||
|
|
||||||
// Delete the selected random example from the prompt
|
// Delete the selected random example from the prompt
|
||||||
if (randomize_tasks) {
|
if (randomize_tasks) {
|
||||||
@ -1110,12 +1109,9 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
|
|
||||||
fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
|
fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
|
||||||
|
|
||||||
// This is needed as usual for LLaMA models
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
|
||||||
|
|
||||||
for (auto & task : data) {
|
for (auto & task : data) {
|
||||||
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, add_bos);
|
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
|
||||||
task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, add_bos);
|
task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
|
||||||
|
|
||||||
task.common_prefix = 0;
|
task.common_prefix = 0;
|
||||||
for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
|
for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
|
||||||
@ -1130,8 +1126,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
task.seq_tokens[0].size() - task.common_prefix +
|
task.seq_tokens[0].size() - task.common_prefix +
|
||||||
task.seq_tokens[1].size() - task.common_prefix;
|
task.seq_tokens[1].size() - task.common_prefix;
|
||||||
|
|
||||||
task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos).size();
|
task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
|
||||||
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos).size();
|
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
|
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
|
||||||
@ -1322,7 +1318,7 @@ struct multiple_choice_task {
|
|||||||
std::vector<float> log_probs;
|
std::vector<float> log_probs;
|
||||||
};
|
};
|
||||||
|
|
||||||
static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
|
static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
|
||||||
if (task.question.empty() || task.mc1.answers.empty()) {
|
if (task.question.empty() || task.mc1.answers.empty()) {
|
||||||
if (log_error) {
|
if (log_error) {
|
||||||
printf("%s: found bad task with empty question and/or answers\n", __func__);
|
printf("%s: found bad task with empty question and/or answers\n", __func__);
|
||||||
@ -1337,7 +1333,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos,
|
|||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
|
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
|
||||||
}
|
}
|
||||||
auto min_len = task.seq_tokens.front().size();
|
auto min_len = task.seq_tokens.front().size();
|
||||||
for (auto& seq : task.seq_tokens) {
|
for (auto& seq : task.seq_tokens) {
|
||||||
@ -1436,9 +1432,6 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
n_task = params.multiple_choice_tasks;
|
n_task = params.multiple_choice_tasks;
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is needed as usual for LLaMA models
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
|
||||||
|
|
||||||
printf("%s: preparing task data", __func__);
|
printf("%s: preparing task data", __func__);
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
if (n_task > 500) {
|
if (n_task > 500) {
|
||||||
@ -1446,7 +1439,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
std::atomic<int> counter(0);
|
std::atomic<int> counter(0);
|
||||||
std::atomic<int> n_bad(0);
|
std::atomic<int> n_bad(0);
|
||||||
auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
|
auto prepare = [&counter, &n_bad, &tasks, ctx] () {
|
||||||
int num_tasks = tasks.size();
|
int num_tasks = tasks.size();
|
||||||
int n_bad_local = 0;
|
int n_bad_local = 0;
|
||||||
while (true) {
|
while (true) {
|
||||||
@ -1457,7 +1450,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
}
|
}
|
||||||
int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
|
int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
|
||||||
for (int i = first; i < last; ++i) {
|
for (int i = first; i < last; ++i) {
|
||||||
if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
|
if (!multiple_choice_prepare_one_task(ctx, tasks[i], false)) ++n_bad_local;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -1479,7 +1472,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
int i_task = 0;
|
int i_task = 0;
|
||||||
for (auto& task : tasks) {
|
for (auto& task : tasks) {
|
||||||
++i_task;
|
++i_task;
|
||||||
if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
|
if (!multiple_choice_prepare_one_task(ctx, task, true)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (i_task%n_dot == 0) {
|
if (i_task%n_dot == 0) {
|
||||||
@ -1715,6 +1708,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
|||||||
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
|
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
|
||||||
const int nv = 2*((n_vocab + 1)/2) + 4;
|
const int nv = 2*((n_vocab + 1)/2) + 4;
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||||
|
|
||||||
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
|
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
|
||||||
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
|
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
|
||||||
|
@ -685,6 +685,7 @@ struct server_context {
|
|||||||
n_ctx = llama_n_ctx(ctx);
|
n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
add_bos_token = llama_should_add_bos_token(model);
|
add_bos_token = llama_should_add_bos_token(model);
|
||||||
|
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -754,7 +755,7 @@ struct server_context {
|
|||||||
metrics.init();
|
metrics.init();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const {
|
std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
|
||||||
// TODO: currently, we tokenize using special tokens by default
|
// TODO: currently, we tokenize using special tokens by default
|
||||||
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
|
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
|
||||||
// but it's better compared to completely ignoring ChatML and other chat templates
|
// but it's better compared to completely ignoring ChatML and other chat templates
|
||||||
@ -772,7 +773,7 @@ struct server_context {
|
|||||||
|
|
||||||
std::vector<llama_token> p;
|
std::vector<llama_token> p;
|
||||||
if (first) {
|
if (first) {
|
||||||
p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
|
||||||
first = false;
|
first = false;
|
||||||
} else {
|
} else {
|
||||||
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
|
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
|
||||||
@ -789,7 +790,7 @@ struct server_context {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
auto s = json_prompt.template get<std::string>();
|
auto s = json_prompt.template get<std::string>();
|
||||||
prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
return prompt_tokens;
|
return prompt_tokens;
|
||||||
@ -1054,7 +1055,7 @@ struct server_context {
|
|||||||
system_tokens.clear();
|
system_tokens.clear();
|
||||||
|
|
||||||
if (!system_prompt.empty()) {
|
if (!system_prompt.empty()) {
|
||||||
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
|
system_tokens = ::llama_tokenize(ctx, system_prompt, true);
|
||||||
|
|
||||||
llama_batch_clear(batch);
|
llama_batch_clear(batch);
|
||||||
|
|
||||||
@ -1809,7 +1810,7 @@ struct server_context {
|
|||||||
prefix_tokens.push_back(llama_token_middle(model));
|
prefix_tokens.push_back(llama_token_middle(model));
|
||||||
prompt_tokens = prefix_tokens;
|
prompt_tokens = prefix_tokens;
|
||||||
} else {
|
} else {
|
||||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
|
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.n_past = 0;
|
slot.n_past = 0;
|
||||||
|
@ -118,7 +118,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
|
inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);
|
||||||
|
|
||||||
const int max_context_size = llama_n_ctx(ctx_tgt);
|
const int max_context_size = llama_n_ctx(ctx_tgt);
|
||||||
const int max_tokens_list_size = max_context_size - 4;
|
const int max_tokens_list_size = max_context_size - 4;
|
||||||
|
@ -26,11 +26,9 @@ int main(int argc, char ** argv) {
|
|||||||
llama_context_params ctx_params = llama_context_default_params();
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||||
|
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
|
||||||
|
|
||||||
std::vector<llama_token> tokens;
|
std::vector<llama_token> tokens;
|
||||||
|
|
||||||
tokens = ::llama_tokenize(model, prompt, add_bos, true);
|
tokens = ::llama_tokenize(model, prompt, true, true);
|
||||||
|
|
||||||
for (int i = 0; i < (int) tokens.size(); i++) {
|
for (int i = 0; i < (int) tokens.size(); i++) {
|
||||||
if (printing_ids) {
|
if (printing_ids) {
|
||||||
|
Loading…
Reference in New Issue
Block a user