mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 05:48:47 +01:00
Minor improvements in GPT2 tokenizer (#3567)
* Fixing minor bugs in bpe_gpt2_preprocess
* Don't add bos token in test
This commit is contained in:
parent
c5b49360d0
commit
233fc1c69f
@ -6342,7 +6342,6 @@ private:
|
|||||||
for (int i = 0; i < (int)text_utf.size(); i++) {
|
for (int i = 0; i < (int)text_utf.size(); i++) {
|
||||||
const std::string & utf_char = text_utf[i];
|
const std::string & utf_char = text_utf[i];
|
||||||
bool split_condition = false;
|
bool split_condition = false;
|
||||||
// const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
|
|
||||||
int bytes_remain = text_utf.size() - i;
|
int bytes_remain = text_utf.size() - i;
|
||||||
// forward backward lookups
|
// forward backward lookups
|
||||||
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
||||||
@ -6368,9 +6367,9 @@ private:
|
|||||||
if (!split_condition && bytes_remain >= 3) {
|
if (!split_condition && bytes_remain >= 3) {
|
||||||
// 're|'ve|'ll
|
// 're|'ve|'ll
|
||||||
if (utf_char == "\'" && (
|
if (utf_char == "\'" && (
|
||||||
(utf_char_next == "r" || utf_char_next_next == "e") ||
|
(utf_char_next == "r" && utf_char_next_next == "e") ||
|
||||||
(utf_char_next == "v" || utf_char_next_next == "e") ||
|
(utf_char_next == "v" && utf_char_next_next == "e") ||
|
||||||
(utf_char_next == "l" || utf_char_next_next == "l"))
|
(utf_char_next == "l" && utf_char_next_next == "l"))
|
||||||
) {
|
) {
|
||||||
split_condition = true;
|
split_condition = true;
|
||||||
}
|
}
|
||||||
@ -6421,7 +6420,7 @@ private:
|
|||||||
else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
||||||
split_condition = true;
|
split_condition = true;
|
||||||
}
|
}
|
||||||
else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
|
else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
|
||||||
split_condition = true;
|
split_condition = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -36,6 +36,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
|||||||
{ " Hello" , { 258, 23090, }, },
|
{ " Hello" , { 258, 23090, }, },
|
||||||
{ " Hello" , { 466, 23090, }, },
|
{ " Hello" , { 466, 23090, }, },
|
||||||
{ " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
|
{ " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
|
||||||
|
{ "\n =" , { 1212, 40, }, },
|
||||||
|
{ "' era" , { 18, 4932, }, },
|
||||||
};
|
};
|
||||||
|
|
||||||
return _k_tests;
|
return _k_tests;
|
||||||
@ -155,7 +157,7 @@ int main(int argc, char **argv) {
|
|||||||
|
|
||||||
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
|
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
|
||||||
|
|
||||||
const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
|
const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
|
||||||
|
|
||||||
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
|
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
|
||||||
|
|
||||||
@ -169,10 +171,8 @@ int main(int argc, char **argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (const auto & tok : res) {
|
for (const auto & tok : res) {
|
||||||
ofs << tok << " ";
|
ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
ofs << "\n";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
|
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
|
||||||
|
@ -41,6 +41,8 @@ tests = [
|
|||||||
" Hello",
|
" Hello",
|
||||||
" Hello",
|
" Hello",
|
||||||
" Hello\n Hello",
|
" Hello\n Hello",
|
||||||
|
"\n =",
|
||||||
|
"' era",
|
||||||
]
|
]
|
||||||
|
|
||||||
for text in tests:
|
for text in tests:
|
||||||
@ -69,15 +71,14 @@ fname_tok = args.fname_tok
|
|||||||
if fname_tok:
|
if fname_tok:
|
||||||
print('tokenizing file: ', fname_tok)
|
print('tokenizing file: ', fname_tok)
|
||||||
fname_out = fname_tok + '.tok'
|
fname_out = fname_tok + '.tok'
|
||||||
with open(fname_tok, 'r') as f:
|
with open(fname_tok, 'r', encoding='utf-8') as f:
|
||||||
lines = f.readlines()
|
lines = f.readlines()
|
||||||
s = ''.join(lines)
|
s = ''.join(lines)
|
||||||
res = tokenizer.encode(s)
|
res = tokenizer.encode(s)
|
||||||
# write to file
|
# write to file
|
||||||
with open(fname_out, 'w') as f:
|
with open(fname_out, 'w', encoding='utf-8') as f:
|
||||||
for x in res:
|
for x in res:
|
||||||
f.write(str(x) + ' ')
|
f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
|
||||||
f.write('\n')
|
|
||||||
print('len(res): ', len(res))
|
print('len(res): ', len(res))
|
||||||
print('len(lines): ', len(lines))
|
print('len(lines): ', len(lines))
|
||||||
print('results written to: ', fname_out)
|
print('results written to: ', fname_out)
|
||||||
|
@ -174,10 +174,8 @@ int main(int argc, char **argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (const auto & tok : res) {
|
for (const auto & tok : res) {
|
||||||
ofs << tok << " ";
|
ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
ofs << "\n";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
|
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
|
||||||
|
@ -81,15 +81,14 @@ fname_tok = args.fname_tok
|
|||||||
if fname_tok:
|
if fname_tok:
|
||||||
print('tokenizing file: ', fname_tok)
|
print('tokenizing file: ', fname_tok)
|
||||||
fname_out = fname_tok + '.tok'
|
fname_out = fname_tok + '.tok'
|
||||||
with open(fname_tok, 'r') as f:
|
with open(fname_tok, 'r', encoding='utf-8') as f:
|
||||||
lines = f.readlines()
|
lines = f.readlines()
|
||||||
s = ''.join(lines)
|
s = ''.join(lines)
|
||||||
res = tokenizer.encode(s, add_bos=True)
|
res = tokenizer.encode(s, add_bos=True)
|
||||||
# write to file
|
# write to file
|
||||||
with open(fname_out, 'w') as f:
|
with open(fname_out, 'w', encoding='utf-8') as f:
|
||||||
for x in res:
|
for x in res:
|
||||||
f.write(str(x) + ' ')
|
f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
|
||||||
f.write('\n')
|
|
||||||
print('len(res): ', len(res))
|
print('len(res): ', len(res))
|
||||||
print('len(lines): ', len(lines))
|
print('len(lines): ', len(lines))
|
||||||
print('results written to: ', fname_out)
|
print('results written to: ', fname_out)
|
||||||
|
Loading…
Reference in New Issue
Block a user