mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 05:48:47 +01:00
embedding : add EOS token if not present (#899)
This commit is contained in:
parent
77178eedc8
commit
044ec4b2a5
@ -112,13 +112,20 @@ int main(int argc, char ** argv) {
|
|||||||
// tokenize the prompts and trim
|
// tokenize the prompts and trim
|
||||||
std::vector<std::vector<int32_t>> inputs;
|
std::vector<std::vector<int32_t>> inputs;
|
||||||
for (const auto & prompt : prompts) {
|
for (const auto & prompt : prompts) {
|
||||||
auto inp = ::llama_tokenize(ctx, prompt, true);
|
auto inp = ::llama_tokenize(ctx, prompt, true, false);
|
||||||
if (inp.size() > n_batch) {
|
if (inp.size() > n_batch) {
|
||||||
inp.resize(n_batch);
|
inp.resize(n_batch);
|
||||||
}
|
}
|
||||||
inputs.push_back(inp);
|
inputs.push_back(inp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// add eos if not present
|
||||||
|
for (auto & inp : inputs) {
|
||||||
|
if (inp.empty() || inp.back() != llama_token_eos(model)) {
|
||||||
|
inp.push_back(llama_token_eos(model));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// tokenization stats
|
// tokenization stats
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
for (int i = 0; i < (int) inputs.size(); i++) {
|
for (int i = 0; i < (int) inputs.size(); i++) {
|
||||||
@ -172,7 +179,7 @@ int main(int argc, char ** argv) {
|
|||||||
for (int j = 0; j < n_prompts; j++) {
|
for (int j = 0; j < n_prompts; j++) {
|
||||||
fprintf(stdout, "embedding %d: ", j);
|
fprintf(stdout, "embedding %d: ", j);
|
||||||
for (int i = 0; i < std::min(16, n_embd); i++) {
|
for (int i = 0; i < std::min(16, n_embd); i++) {
|
||||||
fprintf(stdout, "%f ", emb[j * n_embd + i]);
|
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
|
||||||
}
|
}
|
||||||
fprintf(stdout, "\n");
|
fprintf(stdout, "\n");
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user