mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 05:17:21 +01:00
Final touches
This commit is contained in:
parent
775328064e
commit
319cdb3e1f
@ -114,6 +114,5 @@ python3 convert-pth-to-ggml.py models/7B/ 1
|
|||||||
In general, it seems to work, but I think it fails to handle Unicode characters. Hopefully, someone can help with that.
|
In general, it seems to work, but I think it fails to handle Unicode characters. Hopefully, someone can help with that.
|
||||||
- I don't know yet how much the quantization affects the quality of the generated text
|
- I don't know yet how much the quantization affects the quality of the generated text
|
||||||
- Probably the token sampling can be improved
|
- Probably the token sampling can be improved
|
||||||
- No Windows support
|
|
||||||
- x86 quantization support is [not yet ready](https://github.com/ggerganov/ggml/pull/27). Basically, you want to run this on Apple Silicon.
|
- x86 quantization support is [not yet ready](https://github.com/ggerganov/ggml/pull/27). Basically, you want to run this on Apple Silicon.
|
||||||
|
|
||||||
|
1
main.cpp
1
main.cpp
@ -728,6 +728,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// end of text token
|
// end of text token
|
||||||
if (embd.back() == 2) {
|
if (embd.back() == 2) {
|
||||||
|
printf(" [end of text]\n");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
0
models/.gitignore
vendored
Normal file
0
models/.gitignore
vendored
Normal file
54
utils.cpp
54
utils.cpp
@ -231,39 +231,39 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
|
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
|
||||||
auto res = gpt_tokenize(vocab, text);
|
//auto res = gpt_tokenize(vocab, text);
|
||||||
|
|
||||||
if (bos) {
|
|
||||||
res.insert(res.begin(), 1); // TODO: replace with vocab.bos
|
|
||||||
}
|
|
||||||
|
|
||||||
//std::vector<gpt_vocab::id> res;
|
|
||||||
|
|
||||||
//if (bos) {
|
//if (bos) {
|
||||||
// res.push_back(1); // TODO: replace with vocab.bos
|
// res.insert(res.begin(), 1); // TODO: replace with vocab.bos
|
||||||
//}
|
//}
|
||||||
|
|
||||||
// find the longest token that matches the text
|
std::vector<gpt_vocab::id> res;
|
||||||
//int pos = 0;
|
|
||||||
//while (true) {
|
|
||||||
// int l = 0;
|
|
||||||
// int t = 0;
|
|
||||||
// for (const auto & kv : vocab.id_to_token) {
|
|
||||||
// if (kv.second.size() < l) continue;
|
|
||||||
// if (kv.second.size() > text.size() - pos) continue;
|
|
||||||
// if (text.substr(pos, kv.second.size()) == kv.second) {
|
|
||||||
// l = kv.second.size();
|
|
||||||
// t = kv.first;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// if (l == 0 && t != 13) {
|
if (bos) {
|
||||||
// break;
|
res.push_back(1); // TODO: replace with vocab.bos
|
||||||
// }
|
}
|
||||||
|
|
||||||
// res.push_back(t);
|
//find the longest token that matches the text
|
||||||
// pos += l;
|
int pos = 0;
|
||||||
//}
|
while (true) {
|
||||||
|
int l = 0;
|
||||||
|
int t = 0;
|
||||||
|
for (const auto & kv : vocab.id_to_token) {
|
||||||
|
if (kv.second.size() < l) continue;
|
||||||
|
if (kv.second.size() > text.size() - pos) continue;
|
||||||
|
if (text.substr(pos, kv.second.size()) == kv.second) {
|
||||||
|
l = kv.second.size();
|
||||||
|
t = kv.first;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (l == 0 && t != 13) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
res.push_back(t);
|
||||||
|
pos += l;
|
||||||
|
}
|
||||||
|
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
6
utils.h
6
utils.h
@ -15,12 +15,12 @@
|
|||||||
struct gpt_params {
|
struct gpt_params {
|
||||||
int32_t seed = -1; // RNG seed
|
int32_t seed = -1; // RNG seed
|
||||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||||
int32_t n_predict = 200; // new tokens to predict
|
int32_t n_predict = 128; // new tokens to predict
|
||||||
|
|
||||||
// sampling parameters
|
// sampling parameters
|
||||||
int32_t top_k = 100;
|
int32_t top_k = 40;
|
||||||
float top_p = 0.95f;
|
float top_p = 0.95f;
|
||||||
float temp = 0.8f;
|
float temp = 0.80f;
|
||||||
|
|
||||||
int32_t n_batch = 8; // batch size for prompt processing
|
int32_t n_batch = 8; // batch size for prompt processing
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user