gguf : make token scores and types optional (#3347)
commit ecf90b1a51
parent 2619109ad5
@@ -133,8 +133,6 @@ gguf_writer.add_file_type(ftype)
 print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
 
 tokenizer_json_file = dir_model / 'tokenizer.json'
 if not tokenizer_json_file.is_file():
@@ -177,12 +175,8 @@ for i in range(vocab_size):
         text = bytearray(pad_token)
 
     tokens.append(text)
-    scores.append(0.0) # dymmy
-    toktypes.append(gguf.TokenType.NORMAL) # dummy
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
@@ -117,8 +117,6 @@ gguf_writer.add_file_type(ftype)
 print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
 
 tokenizer_json_file = dir_model / 'tokenizer.json'
 if not tokenizer_json_file.is_file():
@@ -161,12 +159,8 @@ for i in range(vocab_size):
         text = bytearray(pad_token)
 
     tokens.append(text)
-    scores.append(0.0) # dymmy
-    toktypes.append(gguf.TokenType.NORMAL) # dummy
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
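Both conversion scripts used to emit placeholder score and token-type arrays purely to satisfy the loader. With those keys now optional, a writer can emit the token list alone. A minimal vocab-only sketch, assuming the gguf-py GGUFWriter API these scripts already use; the output path, architecture, and tokens are placeholders, not values from the diff:

import gguf

# Placeholder output path and architecture, for illustration only.
writer = gguf.GGUFWriter("vocab-only.gguf", "llama")

tokens = [b"<s>", b"</s>", b"hello"]  # stand-in vocabulary
writer.add_tokenizer_model("gpt2")    # BPE-style tokenizer, as in these scripts
writer.add_token_list(tokens)
# No longer required; the loader now tolerates their absence:
#   writer.add_token_scores([0.0] * len(tokens))
#   writer.add_token_types([gguf.TokenType.NORMAL] * len(tokens))

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()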
llama.cpp
@@ -1931,20 +1931,18 @@ static void llm_load_vocab(
         throw std::runtime_error("cannot find tokenizer vocab in model file\n");
     }
 
+    const float * scores = nullptr;
     const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-    if (score_idx == -1) {
-        throw std::runtime_error("cannot find tokenizer scores in model file\n");
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
     }
 
-    const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-
+    const int * toktypes = nullptr;
     const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-    if (toktype_idx == -1) {
-        throw std::runtime_error("cannot find token type list in GGUF file\n");
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
     }
 
-    const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-
     // determine vocab type
     {
         std::string tokenizer_name;
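The hunk above replaces a hard error with a null fall-through. A sketch of the same lookup logic in Python, using a plain dict to stand in for the GGUF key-value store; the two key names are the actual GGUF keys behind LLM_KV_TOKENIZER_SCORES and LLM_KV_TOKENIZER_TOKEN_TYPE:

def read_optional_token_arrays(metadata: dict):
    # dict.get() returning None plays the role of gguf_find_key() == -1:
    # a missing key is no longer fatal.
    scores = metadata.get("tokenizer.ggml.scores")
    toktypes = metadata.get("tokenizer.ggml.token_type")
    return scores, toktypes

# A vocab-only file loads fine; both arrays simply come back as None.
print(read_optional_token_arrays({"tokenizer.ggml.tokens": ["a", "b"]}))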
@@ -2012,8 +2010,8 @@ static void llm_load_vocab(
 
         auto & token_data = vocab.id_to_token[i];
         token_data.text = std::move(word);
-        token_data.score = scores[i];
-        token_data.type = (llama_token_type) toktypes[i];
+        token_data.score = scores ? scores[i] : 0.0f;
+        token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
     }
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
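And the per-token fallback, continuing the sketch above: when an array is absent, each token degrades to score 0.0 and type NORMAL instead of aborting the load.

import gguf

def token_fields(i: int, scores, toktypes):
    # Mirrors `scores ? scores[i] : 0.0f` and the token-type equivalent above.
    score = scores[i] if scores is not None else 0.0
    toktype = toktypes[i] if toktypes is not None else gguf.TokenType.NORMAL
    return score, toktype

print(token_fields(0, None, None))  # score 0.0, type NORMAL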