mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-07 11:23:56 +01:00
cmpnct_gpt2bpe.hpp : comments
This commit is contained in:
parent
278ada9572
commit
db5618ad99
@ -11,12 +11,15 @@
|
|||||||
#include <queue>
|
#include <queue>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
|
||||||
|
//-----
|
||||||
|
// Unicode GPT2 Byte Pair Encoding Tokenizer
|
||||||
|
// Adapted from https://github.com/cmp-nct/ggllm.cpp
|
||||||
|
//-----
|
||||||
|
|
||||||
/**
|
// Unicode library (from cmpnct_unicode.cpp)
|
||||||
* https://github.com/cmp-nct/ggllm.cpp
|
|
||||||
* Minimal library for high performance handling and categorization of UTF8 strings and characters
|
// Minimal library for high performance handling and categorization of UTF8 strings and characters
|
||||||
* Using std::string
|
// Using std::string
|
||||||
*/
|
|
||||||
|
|
||||||
enum CNCTCharType {
|
enum CNCTCharType {
|
||||||
DIGIT, // a numerical char in any language
|
DIGIT, // a numerical char in any language
|
||||||
@ -367,7 +370,7 @@ bool CNCTUnicode::string_test(const std::string &str, CNCTCharType chartype)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ported from libfalcon.cpp (https://github.com/cmp-nct/ggllm.cpp)
|
// llama.cpp GPT2 vocab (from libfalcon.cpp)
|
||||||
|
|
||||||
std::string replaceAll(std::string str, const std::string& from, const std::string& to) {
|
std::string replaceAll(std::string str, const std::string& from, const std::string& to) {
|
||||||
size_t start_pos = 0;
|
size_t start_pos = 0;
|
||||||
|
Loading…
Reference in New Issue
Block a user