mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 13:27:21 +01:00
sentencepiece bpe compatible tokenizer (#252)
* potential out of bounds read * fix quantize * style * Update convert-pth-to-ggml.py * mild cleanup * don't need the space-prefixing here rn since main.cpp already does it * new file magic + version header field * readme notice * missing newlines Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>
This commit is contained in:
parent
5cb63e2493
commit
074bea2eb1
2
Makefile
2
Makefile
@ -31,7 +31,7 @@ endif
|
|||||||
#
|
#
|
||||||
|
|
||||||
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
|
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
|
||||||
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
|
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++17 -fPIC
|
||||||
LDFLAGS =
|
LDFLAGS =
|
||||||
|
|
||||||
# OS specific
|
# OS specific
|
||||||
|
@ -11,6 +11,9 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
|||||||
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
|
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
|
||||||
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
|
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
|
||||||
|
|
||||||
|
**TEMPORARY NOTICE:**
|
||||||
|
If you're updating to the latest master, you will need to regenerate your model files as the format has changed.
|
||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
||||||
The main goal is to run the model using 4-bit quantization on a MacBook
|
The main goal is to run the model using 4-bit quantization on a MacBook
|
||||||
|
@ -60,7 +60,8 @@ def write_header(fout, hparams, ftype):
|
|||||||
|
|
||||||
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
|
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
|
||||||
values = [
|
values = [
|
||||||
0x67676d6c, # magic: ggml in hex
|
0x67676d66, # magic: ggmf in hex
|
||||||
|
1, # file version
|
||||||
*[hparams[key] for key in keys],
|
*[hparams[key] for key in keys],
|
||||||
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
|
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
|
||||||
ftype
|
ftype
|
||||||
@ -85,6 +86,7 @@ def write_tokens(fout, tokenizer):
|
|||||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
||||||
fout.write(struct.pack("i", len(text)))
|
fout.write(struct.pack("i", len(text)))
|
||||||
fout.write(text)
|
fout.write(text)
|
||||||
|
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||||
|
|
||||||
def process_and_write_variables(fout, model, ftype):
|
def process_and_write_variables(fout, model, ftype):
|
||||||
|
|
||||||
|
21
main.cpp
21
main.cpp
@ -3,6 +3,7 @@
|
|||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <cinttypes>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
@ -105,10 +106,24 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
|||||||
{
|
{
|
||||||
uint32_t magic;
|
uint32_t magic;
|
||||||
fin.read((char *) &magic, sizeof(magic));
|
fin.read((char *) &magic, sizeof(magic));
|
||||||
if (magic != 0x67676d6c) {
|
if (magic == 0x67676d6c) {
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
|
||||||
|
__func__, fname.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (magic != 0x67676d66) {
|
||||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint32_t format_version;
|
||||||
|
fin.read((char *) &format_version, sizeof(format_version));
|
||||||
|
|
||||||
|
if (format_version != 1) {
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n",
|
||||||
|
__func__, fname.c_str(), format_version);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int n_ff = 0;
|
int n_ff = 0;
|
||||||
@ -154,8 +169,12 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
|||||||
word.resize(len);
|
word.resize(len);
|
||||||
fin.read((char *) word.data(), len);
|
fin.read((char *) word.data(), len);
|
||||||
|
|
||||||
|
float score;
|
||||||
|
fin.read((char *) &score, sizeof(score));
|
||||||
|
|
||||||
vocab.token_to_id[word] = i;
|
vocab.token_to_id[word] = i;
|
||||||
vocab.id_to_token[i] = word;
|
vocab.id_to_token[i] = word;
|
||||||
|
vocab.score[i] = score;
|
||||||
|
|
||||||
//if (i < 30000) {
|
//if (i < 30000) {
|
||||||
// fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
|
// fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
|
||||||
|
24
quantize.cpp
24
quantize.cpp
@ -3,6 +3,7 @@
|
|||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <cinttypes>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
@ -63,12 +64,28 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
|
|||||||
{
|
{
|
||||||
uint32_t magic;
|
uint32_t magic;
|
||||||
finp.read((char *) &magic, sizeof(magic));
|
finp.read((char *) &magic, sizeof(magic));
|
||||||
if (magic != 0x67676d6c) {
|
if (magic == 0x67676d6c) {
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
|
||||||
|
__func__, fname_inp.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (magic != 0x67676d66) {
|
||||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
|
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
fout.write((char *) &magic, sizeof(magic));
|
fout.write((char *) &magic, sizeof(magic));
|
||||||
|
|
||||||
|
uint32_t format_version;
|
||||||
|
finp.read((char *) &format_version, sizeof(format_version));
|
||||||
|
|
||||||
|
if (format_version != 1) {
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n",
|
||||||
|
__func__, fname_inp.c_str(), format_version);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
fout.write((char *) &format_version, sizeof(format_version));
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_hparams hparams;
|
llama_hparams hparams;
|
||||||
@ -122,8 +139,13 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
|
|||||||
finp.read ((char *) word.data(), len);
|
finp.read ((char *) word.data(), len);
|
||||||
fout.write((char *) word.data(), len);
|
fout.write((char *) word.data(), len);
|
||||||
|
|
||||||
|
float score;
|
||||||
|
finp.read ((char *) &score, sizeof(score));
|
||||||
|
fout.write((char *) &score, sizeof(score));
|
||||||
|
|
||||||
vocab.token_to_id[word] = i;
|
vocab.token_to_id[word] = i;
|
||||||
vocab.id_to_token[i] = word;
|
vocab.id_to_token[i] = word;
|
||||||
|
vocab.score[i] = score;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
167
utils.cpp
167
utils.cpp
@ -6,6 +6,7 @@
|
|||||||
#include <regex>
|
#include <regex>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <iterator>
|
#include <iterator>
|
||||||
|
#include <queue>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
@ -294,58 +295,146 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
|
|||||||
return tokens;
|
return tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Calculate this constant from the vocabulary
|
static size_t utf8_len(char src) {
|
||||||
#define MAX_TOKEN_LEN 18
|
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
||||||
// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
|
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
|
||||||
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
|
return lookup[highbits];
|
||||||
std::vector<gpt_vocab::id> res;
|
}
|
||||||
std::vector<int> score;
|
|
||||||
std::vector<gpt_vocab::id> prev;
|
|
||||||
int len = text.length();
|
|
||||||
|
|
||||||
score.resize(len + 1);
|
struct llama_sp_symbol {
|
||||||
prev.resize(len + 1);
|
using index = int;
|
||||||
|
index prev;
|
||||||
|
index next;
|
||||||
|
std::string_view text;
|
||||||
|
};
|
||||||
|
|
||||||
// Forward pass
|
struct llama_sp_bigram {
|
||||||
for (int i = 0; i < len; i++) {
|
struct comparator {
|
||||||
int max_len = std::min(len - i, MAX_TOKEN_LEN);
|
bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
|
||||||
for (int sub_len = 1; sub_len <= max_len; sub_len++) {
|
return (l.score < r.score) || (l.score == r.score && l.left > r.left);
|
||||||
auto sub = text.substr(i, sub_len);
|
}
|
||||||
auto token = vocab.token_to_id.find(sub);
|
};
|
||||||
if (token != vocab.token_to_id.end()) {
|
using queue_storage = std::vector<llama_sp_bigram>;
|
||||||
int token_score = sub.length() * sub.length();
|
using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>;
|
||||||
int local_score = score[i] + token_score;
|
llama_sp_symbol::index left;
|
||||||
int next = i + sub_len;
|
llama_sp_symbol::index right;
|
||||||
if (score[next] < local_score) {
|
float score;
|
||||||
score[next] = local_score;
|
size_t size;
|
||||||
prev[next] = (*token).second;
|
};
|
||||||
|
|
||||||
|
struct llama_tokenizer {
|
||||||
|
llama_tokenizer(const gpt_vocab & vocab): vocab_(vocab) {}
|
||||||
|
|
||||||
|
void tokenize(std::string_view text, std::vector<gpt_vocab::id> & output) {
|
||||||
|
// split string into utf8 chars
|
||||||
|
int index = 0;
|
||||||
|
while (!text.empty()) {
|
||||||
|
llama_sp_symbol sym;
|
||||||
|
size_t char_len = std::min(text.size(), utf8_len(text.data()[0]));
|
||||||
|
sym.text = std::string_view(text.data(), char_len);
|
||||||
|
sym.prev = index - 1;
|
||||||
|
text.remove_prefix(char_len);
|
||||||
|
sym.next = text.empty() ? -1 : index + 1;
|
||||||
|
index++;
|
||||||
|
symbols_.emplace_back(std::move(sym));
|
||||||
|
}
|
||||||
|
|
||||||
|
// seed the work queue with all possible 2-character tokens.
|
||||||
|
for (size_t i = 1; i < symbols_.size(); ++i) {
|
||||||
|
try_add_bigram(i - 1, i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// keep substituting the highest frequency pairs for as long as we can.
|
||||||
|
while (!work_queue_.empty()) {
|
||||||
|
auto bigram = work_queue_.top();
|
||||||
|
work_queue_.pop();
|
||||||
|
|
||||||
|
auto & left_sym = symbols_[bigram.left];
|
||||||
|
auto & right_sym = symbols_[bigram.right];
|
||||||
|
|
||||||
|
// if one of the symbols already got merged, skip it.
|
||||||
|
if (left_sym.text.empty() || right_sym.text.empty() ||
|
||||||
|
left_sym.text.size() + right_sym.text.size() != bigram.size) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// merge the right sym into the left one
|
||||||
|
left_sym.text = std::string_view(left_sym.text.data(), left_sym.text.size() + right_sym.text.size());
|
||||||
|
right_sym.text = std::string_view("");
|
||||||
|
|
||||||
|
// remove the right sym from the chain
|
||||||
|
left_sym.next = right_sym.next;
|
||||||
|
if (right_sym.next >= 0) {
|
||||||
|
symbols_[right_sym.next].prev = bigram.left;
|
||||||
|
}
|
||||||
|
|
||||||
|
// find more substitutions
|
||||||
|
try_add_bigram(left_sym.prev, bigram.left);
|
||||||
|
try_add_bigram(bigram.left, left_sym.next);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i != -1; i = symbols_[i].next) {
|
||||||
|
auto& symbol = symbols_[i];
|
||||||
|
auto token = vocab_.token_to_id.find(std::string(symbol.text));
|
||||||
|
|
||||||
|
if (token == vocab_.token_to_id.end()) {
|
||||||
|
// output any symbols that did not form tokens as bytes.
|
||||||
|
for (int j = 0; j < symbol.text.size(); ++j) {
|
||||||
|
gpt_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
|
||||||
|
output.push_back(token_id);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
output.push_back((*token).second);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Backward pass
|
private:
|
||||||
int i = len;
|
void try_add_bigram(int left, int right) {
|
||||||
while (i > 0) {
|
if (left == -1 || right == -1) {
|
||||||
gpt_vocab::id token_id = prev[i];
|
return;
|
||||||
if (token_id == 0) {
|
|
||||||
// TODO: Return error or something more meaningful
|
|
||||||
printf("failed to tokenize string!\n");
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
res.push_back(token_id);
|
|
||||||
auto token = (*vocab.id_to_token.find(token_id)).second;
|
std::string_view text(symbols_[left].text.data(), symbols_[left].text.size() + symbols_[right].text.size());
|
||||||
i -= token.length();
|
auto token = vocab_.token_to_id.find(std::string(text));
|
||||||
|
|
||||||
|
if (token == vocab_.token_to_id.end()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto score = vocab_.score.find((*token).second);
|
||||||
|
|
||||||
|
if (score == vocab_.score.end()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_sp_bigram bigram;
|
||||||
|
bigram.left = left;
|
||||||
|
bigram.right = right;
|
||||||
|
bigram.score = (*score).second;
|
||||||
|
bigram.size = text.size();
|
||||||
|
work_queue_.push(bigram);
|
||||||
|
}
|
||||||
|
|
||||||
|
const gpt_vocab & vocab_;
|
||||||
|
std::vector<llama_sp_symbol> symbols_;
|
||||||
|
llama_sp_bigram::queue work_queue_;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos) {
|
||||||
|
llama_tokenizer tokenizer(vocab);
|
||||||
|
std::vector<gpt_vocab::id> output;
|
||||||
|
|
||||||
|
if (text.size() == 0) {
|
||||||
|
return output;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (bos) {
|
if (bos) {
|
||||||
res.push_back(1); // TODO: replace with vocab.bos
|
output.push_back(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pieces are in reverse order so correct that
|
tokenizer.tokenize(text, output);
|
||||||
std::reverse(res.begin(), res.end());
|
return output;
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
|
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
|
||||||
|
3
utils.h
3
utils.h
@ -58,6 +58,7 @@ struct gpt_vocab {
|
|||||||
|
|
||||||
std::map<token, id> token_to_id;
|
std::map<token, id> token_to_id;
|
||||||
std::map<id, token> id_to_token;
|
std::map<id, token> id_to_token;
|
||||||
|
std::map<id, float> score;
|
||||||
};
|
};
|
||||||
|
|
||||||
void replace(std::string & str, const std::string & needle, const std::string & replacement);
|
void replace(std::string & str, const std::string & needle, const std::string & replacement);
|
||||||
@ -79,7 +80,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
|
|||||||
|
|
||||||
// TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
|
// TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
|
||||||
// ref: https://github.com/google/sentencepiece
|
// ref: https://github.com/google/sentencepiece
|
||||||
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
|
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos);
|
||||||
|
|
||||||
// load the tokens from encoder.json
|
// load the tokens from encoder.json
|
||||||
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
|
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user