diff --git a/llama.cpp b/llama.cpp index e06c851ad..8818c6928 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2310,16 +2310,17 @@ struct llama_vocab { id special_cls_id = -1; id special_mask_id = -1; - int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add. - int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add. - id linefeed_id = 13; id special_prefix_id = -1; id special_suffix_id = -1; id special_middle_id = -1; id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token - bool add_space_prefix = true; + // tokenizer flags + bool tokenizer_add_space_prefix = true; + bool tokenizer_add_bos = false; + bool tokenizer_add_eos = false; + bool tokenizer_ignore_merges = false; int find_bpe_rank(const std::string & token_left, const std::string & token_right) const { GGML_ASSERT(token_left.find(' ') == std::string::npos); @@ -4770,7 +4771,7 @@ static void llm_load_vocab( const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str()); if (add_space_prefix_keyidx != -1) { - vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx); + vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx); } // The default value of add_space_prefix is true. 
} else if (tokenizer_model == "bert") { vocab.type = LLAMA_VOCAB_TYPE_WPM; @@ -4783,13 +4784,13 @@ static void llm_load_vocab( vocab.special_pad_id = 0; vocab.special_cls_id = 101; vocab.special_mask_id = 103; - vocab.add_space_prefix = false; + vocab.tokenizer_add_space_prefix = false; } else if (tokenizer_model == "gpt2") { vocab.type = LLAMA_VOCAB_TYPE_BPE; const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str()); if (add_space_prefix_keyidx != -1) { - vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx); + vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx); } // read bpe merges and populate bpe ranks @@ -4847,6 +4848,8 @@ static void llm_load_vocab( tokenizer_pre == "llama-v3" || tokenizer_pre == "llama-bpe") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3; + vocab.tokenizer_ignore_merges = true; + vocab.tokenizer_add_bos = true; } else if ( tokenizer_pre == "deepseek-llm") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM; @@ -4897,6 +4900,14 @@ static void llm_load_vocab( } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } + } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; + vocab.tokenizer_add_bos = true; + vocab.tokenizer_add_eos = false; + } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; + vocab.tokenizer_add_bos = true; + vocab.tokenizer_add_eos = false; } else { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } @@ -5041,10 +5052,10 @@ static void llm_load_vocab( bool temp = true; if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) { - vocab.special_add_bos = int(temp); + vocab.tokenizer_add_bos = temp; } if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) { - vocab.special_add_eos = int(temp); + vocab.tokenizer_add_eos = temp; } } @@ -5144,7 +5155,7 @@ static void llm_load_vocab( ); // set attributes by 
model/tokenizer name - if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) { + if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) { _set_token_attr("", LLAMA_TOKEN_ATTR_LSTRIP, true); } else if (_contains_any(model_name, {"phi-3", "phi3"})) { for (auto id : vocab.cache_special_tokens) { @@ -13158,112 +13169,142 @@ struct llm_bigram_bpe { }; struct llm_tokenizer_bpe { - llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {} + llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) { + GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE); + switch (vocab.type_pre) { + case LLAMA_VOCAB_PRE_TYPE_LLAMA3: + regex_exprs = { + // original regex from tokenizer.json + //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + + // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989 + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_DBRX: + case LLAMA_VOCAB_PRE_TYPE_SMAUG: + regex_exprs = { + // same as llama3 + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: + regex_exprs = { + "[\r\n]", + "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", + "\\s?[!-/:-~!-/:-~‘-‟ -。]+", + "\\s+$", + "[一-龥ࠀ-一가-퟿]+", + "\\p{N}+", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: + regex_exprs = { + "[\r\n]", + "\\s?\\p{L}+", + "\\s?\\p{P}+", + "[一-龥ࠀ-一가-퟿]+", + "\\p{N}", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_FALCON: + 
regex_exprs = { + "[\\p{P}\\$\\+<=>\\^~\\|`]+", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + "[0-9][0-9][0-9]", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_MPT: + // TODO: MPT pre-tokenization regexes are unknown + // the following are close, but not exact. run the following: + // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf + GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed"); + regex_exprs = { + "\\s?\\p{L}+", + "\\s?\\p{P}+", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_STARCODER: + case LLAMA_VOCAB_PRE_TYPE_REFACT: + case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: + regex_exprs = { + "\\p{N}", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_GPT2: + case LLAMA_VOCAB_PRE_TYPE_OLMO: + regex_exprs = { + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_STABLELM2: + case LLAMA_VOCAB_PRE_TYPE_QWEN2: + regex_exprs = { + // original regex from tokenizer.json + // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_PORO: + regex_exprs = { + " ?[^(\\s|.,!?…。,、।۔،)]+", + }; + break; + default: + // default regex for BPE tokenization pre-processing + regex_exprs = { + "[\\p{P}\\$\\+<=>\\^~\\|]+", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + "\\p{N}+", + "[0-9][0-9][0-9]", + }; + break; + } + } + + void append(const llama_vocab::id token_id, std::vector & output) const { + output.push_back(token_id); + } + + bool append_bos(std::vector & output) const { + if 
(vocab.tokenizer_add_bos) { + GGML_ASSERT(vocab.special_bos_id != -1); + output.push_back(vocab.special_bos_id); + return true; + } + return false; + } + + bool append_eos(std::vector & output) const { + if (vocab.tokenizer_add_eos) { + GGML_ASSERT(vocab.special_eos_id != -1); + output.push_back(vocab.special_eos_id); + return true; + } + return false; + } + + void check_double_bos_eos(const std::vector & output) const { + if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) { + LLAMA_LOG_WARN( + "%s: Added a BOS token to the prompt as specified by the model but the prompt " + "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " + "Are you sure this is what you want?\n", __FUNCTION__); + } + if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) { + LLAMA_LOG_WARN( + "%s: Added a EOS token to the prompt as specified by the model but the prompt " + "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. 
" + "Are you sure this is what you want?\n", __FUNCTION__); + } + } void tokenize(const std::string & text, std::vector & output) { int final_prev_index = -1; - bool ignore_merges = false; - std::vector word_collection; - switch (vocab.type) { - case LLAMA_VOCAB_TYPE_BPE: - switch (vocab.type_pre) { - case LLAMA_VOCAB_PRE_TYPE_LLAMA3: - ignore_merges = true; - word_collection = unicode_regex_split(text, { - // original regex from tokenizer.json - //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", - - // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989 - "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_DBRX: - case LLAMA_VOCAB_PRE_TYPE_SMAUG: - word_collection = unicode_regex_split(text, { - // same as llama3 - "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: - word_collection = unicode_regex_split(text, { - "[\r\n]", - "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", - "\\s?[!-/:-~!-/:-~‘-‟ -。]+", - "\\s+$", - "[一-龥ࠀ-一가-퟿]+", - "\\p{N}+", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: - word_collection = unicode_regex_split(text, { - "[\r\n]", - "\\s?\\p{L}+", - "\\s?\\p{P}+", - "[一-龥ࠀ-一가-퟿]+", - "\\p{N}", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_FALCON: - word_collection = unicode_regex_split(text, { - "[\\p{P}\\$\\+<=>\\^~\\|]+", - "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", - 
"[0-9][0-9][0-9]", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_MPT: - // TODO: MPT pre-tokenization regexes are unknown - // the following are close, but not exact. run the following: - // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf - GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed"); - word_collection = unicode_regex_split(text, { - "\\s?\\p{L}+", - "\\s?\\p{P}+", - "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_STARCODER: - case LLAMA_VOCAB_PRE_TYPE_REFACT: - case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: - word_collection = unicode_regex_split(text, { - "\\p{N}", - "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_GPT2: - case LLAMA_VOCAB_PRE_TYPE_OLMO: - word_collection = unicode_regex_split(text, { - "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_STABLELM2: - case LLAMA_VOCAB_PRE_TYPE_QWEN2: - word_collection = unicode_regex_split(text, { - // original regex from tokenizer.json - // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" - "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_PORO: - word_collection = unicode_regex_split(text, { - " ?[^(\\s|.,!?…。,、।۔،)]+", - }); - break; - default: - // default regex for BPE tokenization pre-processing - word_collection = unicode_regex_split(text, { - "[\\p{P}\\$\\+<=>\\^~\\|]+", - "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", - "\\p{N}+", - "[0-9][0-9][0-9]", - }); - break; - } - break; - default: - GGML_ASSERT(false); - break; - } + const auto word_collection = unicode_regex_split(text, regex_exprs); 
symbols_final.clear(); @@ -13274,7 +13315,7 @@ struct llm_tokenizer_bpe { int index = 0; size_t offset = 0; - if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) { + if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) { symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()}); offset = word.size(); } @@ -13355,10 +13396,9 @@ struct llm_tokenizer_bpe { for (auto j = str.begin(); j != str.end(); ++j) { std::string byte_str(1, *j); auto token_multibyte = vocab.token_to_id.find(byte_str); - if (token_multibyte == vocab.token_to_id.end()) { - throw std::runtime_error("ERROR: byte not found in vocab"); + if (token_multibyte != vocab.token_to_id.end()) { + output.push_back(token_multibyte->second); } - output.push_back((*token_multibyte).second); } } else { output.push_back((*token).second); @@ -13397,6 +13437,8 @@ private: const llama_vocab & vocab; + std::vector regex_exprs; + std::vector symbols; std::vector symbols_final; @@ -13677,7 +13719,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & bool is_prev_special = false; - if (add_special && vocab.special_add_bos != 0) { + if (add_special && vocab.tokenizer_add_bos) { GGML_ASSERT(vocab.special_bos_id != -1); output.push_back(vocab.special_bos_id); is_prev_special = true; @@ -13687,7 +13729,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); - if (vocab.add_space_prefix) { + if (vocab.tokenizer_add_space_prefix) { if (!output.size() || is_prev_special) { // prefix with space if first token raw_text = " " + raw_text; } @@ -13705,23 +13747,24 @@ static std::vector llama_tokenize_internal(const llama_vocab & } } - if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) { + if (add_special && vocab.tokenizer_add_bos && 
output.size() >= 2 && output[1] == vocab.special_bos_id) { LLAMA_LOG_WARN( "%s: Added a BOS token to the prompt as specified by the model but the prompt " "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " "Are you sure this is what you want?\n", __FUNCTION__); } - if (add_special && vocab.special_add_eos == 1) { + if (add_special && vocab.tokenizer_add_eos) { GGML_ASSERT(vocab.special_eos_id != -1); output.push_back(vocab.special_eos_id); } } break; case LLAMA_VOCAB_TYPE_BPE: { - if (add_special && vocab.special_add_bos != 0) { - GGML_ASSERT(vocab.special_bos_id != -1); - output.push_back(vocab.special_bos_id); + llm_tokenizer_bpe tokenizer(vocab); + + if (add_special) { + tokenizer.append_bos(output); } for (const auto & fragment : fragment_buffer) { @@ -13731,23 +13774,15 @@ static std::vector llama_tokenize_internal(const llama_vocab & #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif - llm_tokenizer_bpe tokenizer(vocab); tokenizer.tokenize(raw_text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) - output.push_back(fragment.token); + tokenizer.append(fragment.token, output); } } - if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) { - LLAMA_LOG_WARN( - "%s: Added a BOS token to the prompt as specified by the model but the prompt " - "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. 
" - "Are you sure this is what you want?\n", __FUNCTION__); - } - - if (add_special && vocab.special_add_eos == 1) { - GGML_ASSERT(vocab.special_add_eos != -1); - output.push_back(vocab.special_eos_id); + if (add_special) { + tokenizer.append_eos(output); + tokenizer.check_double_bos_eos(output); } } break; case LLAMA_VOCAB_TYPE_WPM: @@ -18320,11 +18355,11 @@ llama_token llama_token_nl(const struct llama_model * model) { } int32_t llama_add_bos_token(const struct llama_model * model) { - return model->vocab.special_add_bos; + return model->vocab.tokenizer_add_bos; } int32_t llama_add_eos_token(const struct llama_model * model) { - return model->vocab.special_add_eos; + return model->vocab.tokenizer_add_eos; } llama_token llama_token_prefix(const struct llama_model * model) { diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py index 744873c2a..890e4d7c2 100644 --- a/scripts/gen-unicode-data.py +++ b/scripts/gen-unicode-data.py @@ -1,83 +1,143 @@ -import regex -import ctypes +import array import unicodedata - - -class CoodepointFlags (ctypes.Structure): - _fields_ = [ # see definition in unicode.h - ("is_undefined", ctypes.c_uint16, 1), - ("is_number", ctypes.c_uint16, 1), # regex: \p{N} - ("is_letter", ctypes.c_uint16, 1), # regex: \p{L} - ("is_separator", ctypes.c_uint16, 1), # regex: \p{Z} - ("is_accent_mark", ctypes.c_uint16, 1), # regex: \p{M} - ("is_punctuation", ctypes.c_uint16, 1), # regex: \p{P} - ("is_symbol", ctypes.c_uint16, 1), # regex: \p{S} - ("is_control", ctypes.c_uint16, 1), # regex: \p{C} - ] - - -assert (ctypes.sizeof(CoodepointFlags) == 2) +import requests MAX_CODEPOINTS = 0x110000 -regex_number = regex.compile(r'\p{N}') -regex_letter = regex.compile(r'\p{L}') -regex_separator = regex.compile(r'\p{Z}') -regex_accent_mark = regex.compile(r'\p{M}') -regex_punctuation = regex.compile(r'\p{P}') -regex_symbol = regex.compile(r'\p{S}') -regex_control = regex.compile(r'\p{C}') -regex_whitespace = regex.compile(r'\s') +UNICODE_DATA_URL 
= "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt" -codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)() + +# see https://www.unicode.org/L2/L1999/UnicodeData.html +def unicode_data_iter(): + res = requests.get(UNICODE_DATA_URL) + res.raise_for_status() + data = res.content.decode() + + prev = [] + + for line in data.splitlines(): + # e.g.: 0000;;Cc;0;BN;;;;;N;NULL;;;; + line = line.split(";") + + cpt = int(line[0], base=16) + assert cpt < MAX_CODEPOINTS + + cpt_lower = int(line[-2] or "0", base=16) + assert cpt_lower < MAX_CODEPOINTS + + cpt_upper = int(line[-3] or "0", base=16) + assert cpt_upper < MAX_CODEPOINTS + + categ = line[2].strip() + assert len(categ) == 2 + + bidir = line[4].strip() + assert 1 <= len(bidir) <= 3 + + name = line[1] + if name.endswith(", First>"): + prev = (cpt, cpt_lower, cpt_upper, categ, bidir) + continue + if name.endswith(", Last>"): + assert prev[1:] == (0, 0, categ, bidir) + for c in range(prev[0], cpt): + yield (c, cpt_lower, cpt_upper, categ, bidir) + + yield (cpt, cpt_lower, cpt_upper, categ, bidir) + + +# see definition in unicode.h +CODEPOINT_FLAG_UNDEFINED = 0x0001 # +CODEPOINT_FLAG_NUMBER = 0x0002 # \p{N} +CODEPOINT_FLAG_LETTER = 0x0004 # \p{L} +CODEPOINT_FLAG_SEPARATOR = 0x0008 # \p{Z} +CODEPOINT_FLAG_MARK = 0x0010 # \p{M} +CODEPOINT_FLAG_PUNCTUATION = 0x0020 # \p{P} +CODEPOINT_FLAG_SYMBOL = 0x0040 # \p{S} +CODEPOINT_FLAG_CONTROL = 0x0080 # \p{C} + +UNICODE_CATEGORY_TO_FLAG = { + "Cn": CODEPOINT_FLAG_UNDEFINED, # Undefined + "Cc": CODEPOINT_FLAG_CONTROL, # Control + "Cf": CODEPOINT_FLAG_CONTROL, # Format + "Co": CODEPOINT_FLAG_CONTROL, # Private Use + "Cs": CODEPOINT_FLAG_CONTROL, # Surrogate + "Ll": CODEPOINT_FLAG_LETTER, # Lowercase Letter + "Lm": CODEPOINT_FLAG_LETTER, # Modifier Letter + "Lo": CODEPOINT_FLAG_LETTER, # Other Letter + "Lt": CODEPOINT_FLAG_LETTER, # Titlecase Letter + "Lu": CODEPOINT_FLAG_LETTER, # Uppercase Letter + "L&": CODEPOINT_FLAG_LETTER, # Cased Letter + "Mc": CODEPOINT_FLAG_MARK, #
Spacing Mark + "Me": CODEPOINT_FLAG_MARK, # Enclosing Mark + "Mn": CODEPOINT_FLAG_MARK, # Nonspacing Mark + "Nd": CODEPOINT_FLAG_NUMBER, # Decimal Number + "Nl": CODEPOINT_FLAG_NUMBER, # Letter Number + "No": CODEPOINT_FLAG_NUMBER, # Other Number + "Pc": CODEPOINT_FLAG_PUNCTUATION, # Connector Punctuation + "Pd": CODEPOINT_FLAG_PUNCTUATION, # Dash Punctuation + "Pe": CODEPOINT_FLAG_PUNCTUATION, # Close Punctuation + "Pf": CODEPOINT_FLAG_PUNCTUATION, # Final Punctuation + "Pi": CODEPOINT_FLAG_PUNCTUATION, # Initial Punctuation + "Po": CODEPOINT_FLAG_PUNCTUATION, # Other Punctuation + "Ps": CODEPOINT_FLAG_PUNCTUATION, # Open Punctuation + "Sc": CODEPOINT_FLAG_SYMBOL, # Currency Symbol + "Sk": CODEPOINT_FLAG_SYMBOL, # Modifier Symbol + "Sm": CODEPOINT_FLAG_SYMBOL, # Math Symbol + "So": CODEPOINT_FLAG_SYMBOL, # Other Symbol + "Zl": CODEPOINT_FLAG_SEPARATOR, # Line Separator + "Zp": CODEPOINT_FLAG_SEPARATOR, # Paragraph Separator + "Zs": CODEPOINT_FLAG_SEPARATOR, # Space Separator +} + + +codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS table_whitespace = [] table_lowercase = [] table_uppercase = [] table_nfd = [] -for codepoint in range(MAX_CODEPOINTS): +for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter(): # convert codepoint to unicode character - char = chr(codepoint) + char = chr(cpt) - # regex categories - flags = codepoint_flags[codepoint] - flags.is_number = bool(regex_number.match(char)) - flags.is_letter = bool(regex_letter.match(char)) - flags.is_separator = bool(regex_separator.match(char)) - flags.is_accent_mark = bool(regex_accent_mark.match(char)) - flags.is_punctuation = bool(regex_punctuation.match(char)) - flags.is_symbol = bool(regex_symbol.match(char)) - flags.is_control = bool(regex_control.match(char)) - flags.is_undefined = bytes(flags)[0] == 0 - assert (not flags.is_undefined) - - # whitespaces - if bool(regex_whitespace.match(char)): - table_whitespace.append(codepoint) + # codepoint category flags 
+ codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ] # lowercase conversion - lower = ord(char.lower()[0]) - if codepoint != lower: - table_lowercase.append((codepoint, lower)) + if cpt_lower: + table_lowercase.append((cpt, cpt_lower)) # uppercase conversion - upper = ord(char.upper()[0]) - if codepoint != upper: - table_uppercase.append((codepoint, upper)) + if cpt_upper: + table_uppercase.append((cpt, cpt_upper)) # NFD normalization norm = ord(unicodedata.normalize('NFD', char)[0]) - if codepoint != norm: - table_nfd.append((codepoint, norm)) + if cpt != norm: + table_nfd.append((cpt, norm)) + + +# whitespaces, see "" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt +table_whitespace.extend(range(0x0009, 0x000D + 1)) +table_whitespace.extend(range(0x2000, 0x200A + 1)) +table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000]) + + +# sort by codepoint +table_whitespace.sort() +table_lowercase.sort() +table_uppercase.sort() +table_nfd.sort() # group ranges with same flags ranges_flags = [(0, codepoint_flags[0])] # start, flags for codepoint, flags in enumerate(codepoint_flags): - if bytes(flags) != bytes(ranges_flags[-1][1]): + if flags != ranges_flags[-1][1]: ranges_flags.append((codepoint, flags)) -ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags())) +ranges_flags.append((MAX_CODEPOINTS, 0x0000)) # group ranges with same nfd @@ -90,8 +150,8 @@ for codepoint, norm in table_nfd: ranges_nfd[-1] = (start, codepoint, norm) -# Generate 'unicode-data.cpp' - +# Generate 'unicode-data.cpp': +# python ./scripts//gen-unicode-data.py > unicode-data.cpp def out(line=""): print(line, end='\n') # noqa @@ -110,12 +170,12 @@ out("""\ out("const std::vector> unicode_ranges_flags = { // start, flags // last=next_start-1") for codepoint, flags in ranges_flags: - flags = int.from_bytes(bytes(flags), "little") out("{0x%06X, 0x%04X}," % (codepoint, flags)) out("};\n") out("const std::unordered_set unicode_set_whitespace = {") 
-out(", ".join("0x%06X" % cpt for cpt in table_whitespace)) +for codepoint in table_whitespace: + out("0x%06X," % codepoint) out("};\n") out("const std::unordered_map unicode_map_lowercase = {") diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 52f589511..a07c52fb3 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -11,13 +11,15 @@ import logging import argparse import subprocess import random +import unicodedata from typing import Callable, Iterator import cffi from transformers import AutoTokenizer -logger = logging.getLogger("test-tokenizer-random-bpe") + +logger = logging.getLogger("test-tokenizer-random") class LibLlama: @@ -155,9 +157,14 @@ def generator_custom_text_edge_cases() -> Iterator[str]: 'Cửa Việt', # llama-3, ignore_merges = true 'a', # Phi-3 fail '<|endoftext|>', # Phi-3 fail - 'a\na', # TODO: Bert fail - 'a b', # rstrip phi-3 - 'a b', # lstrip jina-v2 + 'a\na', # bert fail + '"`', # falcon + ' \u2e4e', # falcon + 'a\xa0\xa0\x00b', # jina-v2-es + 'one ', # jina-v2-es lstrip=true + 'a b', # rstrip phi-3 + 'a b', # lstrip jina-v2 + '\xa0aC', # deepseek ] @@ -189,17 +196,23 @@ def generator_random_added_tokens(tokenizer, iterations=100) -> Iterator[str]: for m in range(iterations): rand.seed(m) words = rand.choices(all_tokens, k=500) - if words[0] == tokenizer.bos_token: # skip spam warning of double BOS + if words and words[0] == tokenizer.bos_token: # skip spam warning of double BOS while len(words) > 1 and words[1] == tokenizer.bos_token: # leave one starting BOS words.pop(0) if tokenizer.add_bos_token: # drop all starting BOS words.pop(0) + if words and words[-1] == tokenizer.eos_token: # skip spam warning of double EOS + while len(words) > 1 and words[-2] == tokenizer.eos_token: # leave one trailing EOS + words.pop(-1) + if tokenizer.add_eos_token: # drop all trailing EOS + words.pop(-1) yield "".join(words) def generator_random_chars(iterations=100) -> Iterator[str]: """Brute
force random text with simple characters""" + NUM_WORDS = 400 WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5) CHARS = list(sorted(set(""" ABCDEFGHIJKLMNOPQRSTUVWXYZ @@ -213,12 +226,50 @@ def generator_random_chars(iterations=100) -> Iterator[str]: for m in range(iterations): rand.seed(m) text = [] - num_words = rand.randint(300, 400) - for i in range(num_words): + for _ in range(NUM_WORDS): k = rand.randint(1, 7) word = rand.choices(CHARS, k=k) - space = rand.choice(WHITESPACES) - text.append("".join(word) + space) + word.append(rand.choice(WHITESPACES)) + text.append("".join(word)) + yield "".join(text) + + +def generator_unicodes() -> Iterator[str]: + """Iterate unicode characters""" + + MAX_CODEPOINTS = 0x30000 # 0x110000 + + def _valid(cpt): + if cpt >= 0x30000: # unassigned and supplement­ary + return False + if 0x00D800 <= cpt <= 0x00F8FF: # Surrogates + return False + if unicodedata.category(chr(cpt)) == "Cn": + return False + return True + + characters = [chr(cpt) for cpt in range(1, MAX_CODEPOINTS) if _valid(cpt)] + + yield from characters + + +def generator_random_unicodes(iterations=100) -> Iterator[str]: + """Brute force random text with unicode characters""" + + NUM_WORDS = 200 + WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5) + + characters = list(generator_unicodes()) + + rand = random.Random() + for m in range(iterations): + rand.seed(m) + text = [] + for _ in range(NUM_WORDS): + k = rand.randint(1, 7) + word = rand.choices(characters, k=k) + word.append(rand.choice(WHITESPACES)) + text.append("".join(word)) yield "".join(text) @@ -256,25 +307,7 @@ def generator_random_vocab_words(vocab: list[str], iterations=100) -> Iterator[s yield "".join(text) -def generator_random_bytes(iterations=100) -> Iterator[str]: - """Brute force random bytes""" - - WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5) - - rand = random.Random() - for m in range(iterations): - rand.seed(m) - text = [] - num_words = 
rand.randint(300, 400) - for i in range(num_words): - k = rand.randint(1, 8) - word = [chr(r) for r in rand.randbytes(k) if r] - word.append(rand.choice(WHITESPACES)) - text.append("".join(word)) - yield "".join(text) - - -def test_compare_tokenizer(func_tokenize1: Callable, func_tokenize2: Callable, generator: Iterator[str]): +def compare_tokenizers(func_tokenize1: Callable, func_tokenize2: Callable, generator: Iterator[str]): def find_first_mismatch(ids1: list[int], ids2: list[int]): for i, (a, b) in enumerate(zip(ids1, ids2)): @@ -284,20 +317,34 @@ def test_compare_tokenizer(func_tokenize1: Callable, func_tokenize2: Callable, g return -1 return min(len(ids1), len(ids2)) - t0 = time.perf_counter() + t_tokenizer1 = 0 + t_tokenizer2 = 0 + t_start = time.perf_counter() + num_errors = 10 + logger.info("%s: %s" % (generator.__name__, "ini")) for text in generator: + # print(repr(text), hex(ord(text[0])), text.encode()) + t0 = time.perf_counter() ids1 = func_tokenize1(text) + t1 = time.perf_counter() ids2 = func_tokenize2(text) + t2 = time.perf_counter() + t_tokenizer1 += t1 - t0 + t_tokenizer2 += t2 - t1 if ids1 != ids2: i = find_first_mismatch(ids1, ids2) ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1] ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1] - logger.info(" TokenIDs: " + str(ids1)) - logger.info(" Expected: " + str(ids2)) - raise Exception() - t1 = time.perf_counter() - logger.info("%s: end, time: %.3f secs" % (generator.__name__, t1 - t0)) + logger.error(" TokenIDs: " + str(ids1)) + logger.error(" Expected: " + str(ids2)) + # raise Exception() + num_errors += 1 + if num_errors > 10: + break + + t_total = time.perf_counter() - t_start + logger.info("%s: end, tok1: %.3f tok2: %.3f total: %.3f" % (generator.__name__, t_tokenizer1, t_tokenizer2, t_total)) def main(argv: list[str] = None): @@ -307,7 +354,8 @@ def main(argv: list[str] = None): parser.add_argument("--verbose", action="store_true", help="increase output verbosity") args = parser.parse_args(argv) - 
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + logging.basicConfig(level = logging.DEBUG if args.verbose else logging.INFO) + logger.info(f"VOCABFILE: '{args.vocab_file}'") model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096)) tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer) @@ -321,18 +369,22 @@ def main(argv: list[str] = None): ids = func_tokenize2("a") assert 1 <= len(ids) <= 3 add_bos_token = len(ids) > 1 and tokenizer.bos_token_id == ids[0] + add_eos_token = len(ids) > 1 and tokenizer.eos_token_id == ids[-1] tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", add_bos_token) + tokenizer.add_eos_token = getattr(tokenizer, "add_eos_token", add_eos_token) vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True))) - test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text()) - test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases()) - test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab)) - test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_added_lr_strip(tokenizer)) - test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_added_tokens(tokenizer, 10_000)) - test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000)) - test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000)) - test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000)) - # test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_bytes(10_000)) # FAIL + + compare_tokenizers(func_tokenize1, func_tokenize2, generator_custom_text()) + compare_tokenizers(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases()) + compare_tokenizers(func_tokenize1, func_tokenize2, generator_unicodes()) + 
compare_tokenizers(func_tokenize1, func_tokenize2, generator_vocab_words(vocab)) + compare_tokenizers(func_tokenize1, func_tokenize2, generator_added_lr_strip(tokenizer)) + compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_added_tokens(tokenizer, 10_000)) + compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_chars(10_000)) + compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_unicodes(10_000)) + compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000)) + compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000)) model.free() @@ -340,20 +392,40 @@ def main(argv: list[str] = None): if __name__ == "__main__": # main() + logging.basicConfig( + level = logging.DEBUG, + format = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s", + datefmt = "%Y-%m-%d %H:%M:%S", + filename = logger.name + ".log", + filemode = "a" + ) + path_tokenizers = "./models/tokenizers/" path_vocab_format = "./models/ggml-vocab-%s.gguf" # import os # tokenizers = os.listdir(path_tokenizers) tokenizers = [ - "llama-spm", # SPM - "phi-3", # SPM - "jina-v2-en", # WPM - "bert-bge", # WPM + # "llama-spm", # SPM + # "phi-3", # SPM + # "bert-bge", # WPM + # "jina-v2-en", # WPM + "gpt-2", # BPE + "llama-bpe", # BPE + "falcon", # BPE + "starcoder", # BPE + "jina-v2-es", # BPE + "jina-v2-de", # BPE + "jina-v2-code", # BPE + "smaug-bpe", # BPE + "phi-2", # BPE + "deepseek-coder", # BPE + "deepseek-llm", # BPE ] for tokenizer in tokenizers: - print("\n" + "=" * 50 + "\n" + tokenizer + "\n") # noqa + logger.info("=" * 50) + logger.info(f"TOKENIZER: '{tokenizer}'") vocab_file = path_vocab_format % tokenizer dir_tokenizer = path_tokenizers + "/" + tokenizer main([vocab_file, dir_tokenizer, "--verbose"]) diff --git a/unicode-data.cpp b/unicode-data.cpp index d7c1c898d..4a939898b 100644 --- a/unicode-data.cpp +++ b/unicode-data.cpp @@ -68,36 +68,36 @@ const std::vector> 
unicode_ranges_flags = { // st {0x000370, 0x0004}, {0x000375, 0x0040}, {0x000376, 0x0004}, -{0x000378, 0x0080}, +{0x000378, 0x0001}, {0x00037A, 0x0004}, {0x00037E, 0x0020}, {0x00037F, 0x0004}, -{0x000380, 0x0080}, +{0x000380, 0x0001}, {0x000384, 0x0040}, {0x000386, 0x0004}, {0x000387, 0x0020}, {0x000388, 0x0004}, -{0x00038B, 0x0080}, +{0x00038B, 0x0001}, {0x00038C, 0x0004}, -{0x00038D, 0x0080}, +{0x00038D, 0x0001}, {0x00038E, 0x0004}, -{0x0003A2, 0x0080}, +{0x0003A2, 0x0001}, {0x0003A3, 0x0004}, {0x0003F6, 0x0040}, {0x0003F7, 0x0004}, {0x000482, 0x0040}, {0x000483, 0x0010}, {0x00048A, 0x0004}, -{0x000530, 0x0080}, +{0x000530, 0x0001}, {0x000531, 0x0004}, -{0x000557, 0x0080}, +{0x000557, 0x0001}, {0x000559, 0x0004}, {0x00055A, 0x0020}, {0x000560, 0x0004}, {0x000589, 0x0020}, -{0x00058B, 0x0080}, +{0x00058B, 0x0001}, {0x00058D, 0x0040}, -{0x000590, 0x0080}, +{0x000590, 0x0001}, {0x000591, 0x0010}, {0x0005BE, 0x0020}, {0x0005BF, 0x0010}, @@ -107,12 +107,13 @@ const std::vector> unicode_ranges_flags = { // st {0x0005C4, 0x0010}, {0x0005C6, 0x0020}, {0x0005C7, 0x0010}, -{0x0005C8, 0x0080}, +{0x0005C8, 0x0001}, {0x0005D0, 0x0004}, -{0x0005EB, 0x0080}, +{0x0005EB, 0x0001}, {0x0005EF, 0x0004}, {0x0005F3, 0x0020}, -{0x0005F5, 0x0080}, +{0x0005F5, 0x0001}, +{0x000600, 0x0080}, {0x000606, 0x0040}, {0x000609, 0x0020}, {0x00060B, 0x0040}, @@ -145,16 +146,17 @@ const std::vector> unicode_ranges_flags = { // st {0x0006FD, 0x0040}, {0x0006FF, 0x0004}, {0x000700, 0x0020}, -{0x00070E, 0x0080}, +{0x00070E, 0x0001}, +{0x00070F, 0x0080}, {0x000710, 0x0004}, {0x000711, 0x0010}, {0x000712, 0x0004}, {0x000730, 0x0010}, -{0x00074B, 0x0080}, +{0x00074B, 0x0001}, {0x00074D, 0x0004}, {0x0007A6, 0x0010}, {0x0007B1, 0x0004}, -{0x0007B2, 0x0080}, +{0x0007B2, 0x0001}, {0x0007C0, 0x0002}, {0x0007CA, 0x0004}, {0x0007EB, 0x0010}, @@ -162,7 +164,7 @@ const std::vector> unicode_ranges_flags = { // st {0x0007F6, 0x0040}, {0x0007F7, 0x0020}, {0x0007FA, 0x0004}, -{0x0007FB, 0x0080}, +{0x0007FB, 0x0001}, 
{0x0007FD, 0x0010}, {0x0007FE, 0x0040}, {0x000800, 0x0004}, @@ -173,20 +175,22 @@ const std::vector> unicode_ranges_flags = { // st {0x000825, 0x0010}, {0x000828, 0x0004}, {0x000829, 0x0010}, -{0x00082E, 0x0080}, +{0x00082E, 0x0001}, {0x000830, 0x0020}, -{0x00083F, 0x0080}, +{0x00083F, 0x0001}, {0x000840, 0x0004}, {0x000859, 0x0010}, -{0x00085C, 0x0080}, +{0x00085C, 0x0001}, {0x00085E, 0x0020}, -{0x00085F, 0x0080}, +{0x00085F, 0x0001}, {0x000860, 0x0004}, -{0x00086B, 0x0080}, +{0x00086B, 0x0001}, {0x000870, 0x0004}, {0x000888, 0x0040}, {0x000889, 0x0004}, -{0x00088F, 0x0080}, +{0x00088F, 0x0001}, +{0x000890, 0x0080}, +{0x000892, 0x0001}, {0x000898, 0x0010}, {0x0008A0, 0x0004}, {0x0008CA, 0x0010}, @@ -205,35 +209,35 @@ const std::vector> unicode_ranges_flags = { // st {0x000970, 0x0020}, {0x000971, 0x0004}, {0x000981, 0x0010}, -{0x000984, 0x0080}, +{0x000984, 0x0001}, {0x000985, 0x0004}, -{0x00098D, 0x0080}, +{0x00098D, 0x0001}, {0x00098F, 0x0004}, -{0x000991, 0x0080}, +{0x000991, 0x0001}, {0x000993, 0x0004}, -{0x0009A9, 0x0080}, +{0x0009A9, 0x0001}, {0x0009AA, 0x0004}, -{0x0009B1, 0x0080}, +{0x0009B1, 0x0001}, {0x0009B2, 0x0004}, -{0x0009B3, 0x0080}, +{0x0009B3, 0x0001}, {0x0009B6, 0x0004}, -{0x0009BA, 0x0080}, +{0x0009BA, 0x0001}, {0x0009BC, 0x0010}, {0x0009BD, 0x0004}, {0x0009BE, 0x0010}, -{0x0009C5, 0x0080}, +{0x0009C5, 0x0001}, {0x0009C7, 0x0010}, -{0x0009C9, 0x0080}, +{0x0009C9, 0x0001}, {0x0009CB, 0x0010}, {0x0009CE, 0x0004}, -{0x0009CF, 0x0080}, +{0x0009CF, 0x0001}, {0x0009D7, 0x0010}, -{0x0009D8, 0x0080}, +{0x0009D8, 0x0001}, {0x0009DC, 0x0004}, -{0x0009DE, 0x0080}, +{0x0009DE, 0x0001}, {0x0009DF, 0x0004}, {0x0009E2, 0x0010}, -{0x0009E4, 0x0080}, +{0x0009E4, 0x0001}, {0x0009E6, 0x0002}, {0x0009F0, 0x0004}, {0x0009F2, 0x0040}, @@ -242,173 +246,173 @@ const std::vector> unicode_ranges_flags = { // st {0x0009FC, 0x0004}, {0x0009FD, 0x0020}, {0x0009FE, 0x0010}, -{0x0009FF, 0x0080}, +{0x0009FF, 0x0001}, {0x000A01, 0x0010}, -{0x000A04, 0x0080}, +{0x000A04, 
0x0001}, {0x000A05, 0x0004}, -{0x000A0B, 0x0080}, +{0x000A0B, 0x0001}, {0x000A0F, 0x0004}, -{0x000A11, 0x0080}, +{0x000A11, 0x0001}, {0x000A13, 0x0004}, -{0x000A29, 0x0080}, +{0x000A29, 0x0001}, {0x000A2A, 0x0004}, -{0x000A31, 0x0080}, +{0x000A31, 0x0001}, {0x000A32, 0x0004}, -{0x000A34, 0x0080}, +{0x000A34, 0x0001}, {0x000A35, 0x0004}, -{0x000A37, 0x0080}, +{0x000A37, 0x0001}, {0x000A38, 0x0004}, -{0x000A3A, 0x0080}, +{0x000A3A, 0x0001}, {0x000A3C, 0x0010}, -{0x000A3D, 0x0080}, +{0x000A3D, 0x0001}, {0x000A3E, 0x0010}, -{0x000A43, 0x0080}, +{0x000A43, 0x0001}, {0x000A47, 0x0010}, -{0x000A49, 0x0080}, +{0x000A49, 0x0001}, {0x000A4B, 0x0010}, -{0x000A4E, 0x0080}, +{0x000A4E, 0x0001}, {0x000A51, 0x0010}, -{0x000A52, 0x0080}, +{0x000A52, 0x0001}, {0x000A59, 0x0004}, -{0x000A5D, 0x0080}, +{0x000A5D, 0x0001}, {0x000A5E, 0x0004}, -{0x000A5F, 0x0080}, +{0x000A5F, 0x0001}, {0x000A66, 0x0002}, {0x000A70, 0x0010}, {0x000A72, 0x0004}, {0x000A75, 0x0010}, {0x000A76, 0x0020}, -{0x000A77, 0x0080}, +{0x000A77, 0x0001}, {0x000A81, 0x0010}, -{0x000A84, 0x0080}, +{0x000A84, 0x0001}, {0x000A85, 0x0004}, -{0x000A8E, 0x0080}, +{0x000A8E, 0x0001}, {0x000A8F, 0x0004}, -{0x000A92, 0x0080}, +{0x000A92, 0x0001}, {0x000A93, 0x0004}, -{0x000AA9, 0x0080}, +{0x000AA9, 0x0001}, {0x000AAA, 0x0004}, -{0x000AB1, 0x0080}, +{0x000AB1, 0x0001}, {0x000AB2, 0x0004}, -{0x000AB4, 0x0080}, +{0x000AB4, 0x0001}, {0x000AB5, 0x0004}, -{0x000ABA, 0x0080}, +{0x000ABA, 0x0001}, {0x000ABC, 0x0010}, {0x000ABD, 0x0004}, {0x000ABE, 0x0010}, -{0x000AC6, 0x0080}, +{0x000AC6, 0x0001}, {0x000AC7, 0x0010}, -{0x000ACA, 0x0080}, +{0x000ACA, 0x0001}, {0x000ACB, 0x0010}, -{0x000ACE, 0x0080}, +{0x000ACE, 0x0001}, {0x000AD0, 0x0004}, -{0x000AD1, 0x0080}, +{0x000AD1, 0x0001}, {0x000AE0, 0x0004}, {0x000AE2, 0x0010}, -{0x000AE4, 0x0080}, +{0x000AE4, 0x0001}, {0x000AE6, 0x0002}, {0x000AF0, 0x0020}, {0x000AF1, 0x0040}, -{0x000AF2, 0x0080}, +{0x000AF2, 0x0001}, {0x000AF9, 0x0004}, {0x000AFA, 0x0010}, -{0x000B00, 0x0080}, +{0x000B00, 
0x0001}, {0x000B01, 0x0010}, -{0x000B04, 0x0080}, +{0x000B04, 0x0001}, {0x000B05, 0x0004}, -{0x000B0D, 0x0080}, +{0x000B0D, 0x0001}, {0x000B0F, 0x0004}, -{0x000B11, 0x0080}, +{0x000B11, 0x0001}, {0x000B13, 0x0004}, -{0x000B29, 0x0080}, +{0x000B29, 0x0001}, {0x000B2A, 0x0004}, -{0x000B31, 0x0080}, +{0x000B31, 0x0001}, {0x000B32, 0x0004}, -{0x000B34, 0x0080}, +{0x000B34, 0x0001}, {0x000B35, 0x0004}, -{0x000B3A, 0x0080}, +{0x000B3A, 0x0001}, {0x000B3C, 0x0010}, {0x000B3D, 0x0004}, {0x000B3E, 0x0010}, -{0x000B45, 0x0080}, +{0x000B45, 0x0001}, {0x000B47, 0x0010}, -{0x000B49, 0x0080}, +{0x000B49, 0x0001}, {0x000B4B, 0x0010}, -{0x000B4E, 0x0080}, +{0x000B4E, 0x0001}, {0x000B55, 0x0010}, -{0x000B58, 0x0080}, +{0x000B58, 0x0001}, {0x000B5C, 0x0004}, -{0x000B5E, 0x0080}, +{0x000B5E, 0x0001}, {0x000B5F, 0x0004}, {0x000B62, 0x0010}, -{0x000B64, 0x0080}, +{0x000B64, 0x0001}, {0x000B66, 0x0002}, {0x000B70, 0x0040}, {0x000B71, 0x0004}, {0x000B72, 0x0002}, -{0x000B78, 0x0080}, +{0x000B78, 0x0001}, {0x000B82, 0x0010}, {0x000B83, 0x0004}, -{0x000B84, 0x0080}, +{0x000B84, 0x0001}, {0x000B85, 0x0004}, -{0x000B8B, 0x0080}, +{0x000B8B, 0x0001}, {0x000B8E, 0x0004}, -{0x000B91, 0x0080}, +{0x000B91, 0x0001}, {0x000B92, 0x0004}, -{0x000B96, 0x0080}, +{0x000B96, 0x0001}, {0x000B99, 0x0004}, -{0x000B9B, 0x0080}, +{0x000B9B, 0x0001}, {0x000B9C, 0x0004}, -{0x000B9D, 0x0080}, +{0x000B9D, 0x0001}, {0x000B9E, 0x0004}, -{0x000BA0, 0x0080}, +{0x000BA0, 0x0001}, {0x000BA3, 0x0004}, -{0x000BA5, 0x0080}, +{0x000BA5, 0x0001}, {0x000BA8, 0x0004}, -{0x000BAB, 0x0080}, +{0x000BAB, 0x0001}, {0x000BAE, 0x0004}, -{0x000BBA, 0x0080}, +{0x000BBA, 0x0001}, {0x000BBE, 0x0010}, -{0x000BC3, 0x0080}, +{0x000BC3, 0x0001}, {0x000BC6, 0x0010}, -{0x000BC9, 0x0080}, +{0x000BC9, 0x0001}, {0x000BCA, 0x0010}, -{0x000BCE, 0x0080}, +{0x000BCE, 0x0001}, {0x000BD0, 0x0004}, -{0x000BD1, 0x0080}, +{0x000BD1, 0x0001}, {0x000BD7, 0x0010}, -{0x000BD8, 0x0080}, +{0x000BD8, 0x0001}, {0x000BE6, 0x0002}, {0x000BF3, 0x0040}, -{0x000BFB, 
0x0080}, +{0x000BFB, 0x0001}, {0x000C00, 0x0010}, {0x000C05, 0x0004}, -{0x000C0D, 0x0080}, +{0x000C0D, 0x0001}, {0x000C0E, 0x0004}, -{0x000C11, 0x0080}, +{0x000C11, 0x0001}, {0x000C12, 0x0004}, -{0x000C29, 0x0080}, +{0x000C29, 0x0001}, {0x000C2A, 0x0004}, -{0x000C3A, 0x0080}, +{0x000C3A, 0x0001}, {0x000C3C, 0x0010}, {0x000C3D, 0x0004}, {0x000C3E, 0x0010}, -{0x000C45, 0x0080}, +{0x000C45, 0x0001}, {0x000C46, 0x0010}, -{0x000C49, 0x0080}, +{0x000C49, 0x0001}, {0x000C4A, 0x0010}, -{0x000C4E, 0x0080}, +{0x000C4E, 0x0001}, {0x000C55, 0x0010}, -{0x000C57, 0x0080}, +{0x000C57, 0x0001}, {0x000C58, 0x0004}, -{0x000C5B, 0x0080}, +{0x000C5B, 0x0001}, {0x000C5D, 0x0004}, -{0x000C5E, 0x0080}, +{0x000C5E, 0x0001}, {0x000C60, 0x0004}, {0x000C62, 0x0010}, -{0x000C64, 0x0080}, +{0x000C64, 0x0001}, {0x000C66, 0x0002}, -{0x000C70, 0x0080}, +{0x000C70, 0x0001}, {0x000C77, 0x0020}, {0x000C78, 0x0002}, {0x000C7F, 0x0040}, @@ -416,124 +420,124 @@ const std::vector> unicode_ranges_flags = { // st {0x000C81, 0x0010}, {0x000C84, 0x0020}, {0x000C85, 0x0004}, -{0x000C8D, 0x0080}, +{0x000C8D, 0x0001}, {0x000C8E, 0x0004}, -{0x000C91, 0x0080}, +{0x000C91, 0x0001}, {0x000C92, 0x0004}, -{0x000CA9, 0x0080}, +{0x000CA9, 0x0001}, {0x000CAA, 0x0004}, -{0x000CB4, 0x0080}, +{0x000CB4, 0x0001}, {0x000CB5, 0x0004}, -{0x000CBA, 0x0080}, +{0x000CBA, 0x0001}, {0x000CBC, 0x0010}, {0x000CBD, 0x0004}, {0x000CBE, 0x0010}, -{0x000CC5, 0x0080}, +{0x000CC5, 0x0001}, {0x000CC6, 0x0010}, -{0x000CC9, 0x0080}, +{0x000CC9, 0x0001}, {0x000CCA, 0x0010}, -{0x000CCE, 0x0080}, +{0x000CCE, 0x0001}, {0x000CD5, 0x0010}, -{0x000CD7, 0x0080}, +{0x000CD7, 0x0001}, {0x000CDD, 0x0004}, -{0x000CDF, 0x0080}, +{0x000CDF, 0x0001}, {0x000CE0, 0x0004}, {0x000CE2, 0x0010}, -{0x000CE4, 0x0080}, +{0x000CE4, 0x0001}, {0x000CE6, 0x0002}, -{0x000CF0, 0x0080}, +{0x000CF0, 0x0001}, {0x000CF1, 0x0004}, {0x000CF3, 0x0010}, -{0x000CF4, 0x0080}, +{0x000CF4, 0x0001}, {0x000D00, 0x0010}, {0x000D04, 0x0004}, -{0x000D0D, 0x0080}, +{0x000D0D, 0x0001}, 
{0x000D0E, 0x0004}, -{0x000D11, 0x0080}, +{0x000D11, 0x0001}, {0x000D12, 0x0004}, {0x000D3B, 0x0010}, {0x000D3D, 0x0004}, {0x000D3E, 0x0010}, -{0x000D45, 0x0080}, +{0x000D45, 0x0001}, {0x000D46, 0x0010}, -{0x000D49, 0x0080}, +{0x000D49, 0x0001}, {0x000D4A, 0x0010}, {0x000D4E, 0x0004}, {0x000D4F, 0x0040}, -{0x000D50, 0x0080}, +{0x000D50, 0x0001}, {0x000D54, 0x0004}, {0x000D57, 0x0010}, {0x000D58, 0x0002}, {0x000D5F, 0x0004}, {0x000D62, 0x0010}, -{0x000D64, 0x0080}, +{0x000D64, 0x0001}, {0x000D66, 0x0002}, {0x000D79, 0x0040}, {0x000D7A, 0x0004}, -{0x000D80, 0x0080}, +{0x000D80, 0x0001}, {0x000D81, 0x0010}, -{0x000D84, 0x0080}, +{0x000D84, 0x0001}, {0x000D85, 0x0004}, -{0x000D97, 0x0080}, +{0x000D97, 0x0001}, {0x000D9A, 0x0004}, -{0x000DB2, 0x0080}, +{0x000DB2, 0x0001}, {0x000DB3, 0x0004}, -{0x000DBC, 0x0080}, +{0x000DBC, 0x0001}, {0x000DBD, 0x0004}, -{0x000DBE, 0x0080}, +{0x000DBE, 0x0001}, {0x000DC0, 0x0004}, -{0x000DC7, 0x0080}, +{0x000DC7, 0x0001}, {0x000DCA, 0x0010}, -{0x000DCB, 0x0080}, +{0x000DCB, 0x0001}, {0x000DCF, 0x0010}, -{0x000DD5, 0x0080}, +{0x000DD5, 0x0001}, {0x000DD6, 0x0010}, -{0x000DD7, 0x0080}, +{0x000DD7, 0x0001}, {0x000DD8, 0x0010}, -{0x000DE0, 0x0080}, +{0x000DE0, 0x0001}, {0x000DE6, 0x0002}, -{0x000DF0, 0x0080}, +{0x000DF0, 0x0001}, {0x000DF2, 0x0010}, {0x000DF4, 0x0020}, -{0x000DF5, 0x0080}, +{0x000DF5, 0x0001}, {0x000E01, 0x0004}, {0x000E31, 0x0010}, {0x000E32, 0x0004}, {0x000E34, 0x0010}, -{0x000E3B, 0x0080}, +{0x000E3B, 0x0001}, {0x000E3F, 0x0040}, {0x000E40, 0x0004}, {0x000E47, 0x0010}, {0x000E4F, 0x0020}, {0x000E50, 0x0002}, {0x000E5A, 0x0020}, -{0x000E5C, 0x0080}, +{0x000E5C, 0x0001}, {0x000E81, 0x0004}, -{0x000E83, 0x0080}, +{0x000E83, 0x0001}, {0x000E84, 0x0004}, -{0x000E85, 0x0080}, +{0x000E85, 0x0001}, {0x000E86, 0x0004}, -{0x000E8B, 0x0080}, +{0x000E8B, 0x0001}, {0x000E8C, 0x0004}, -{0x000EA4, 0x0080}, +{0x000EA4, 0x0001}, {0x000EA5, 0x0004}, -{0x000EA6, 0x0080}, +{0x000EA6, 0x0001}, {0x000EA7, 0x0004}, {0x000EB1, 0x0010}, 
{0x000EB2, 0x0004}, {0x000EB4, 0x0010}, {0x000EBD, 0x0004}, -{0x000EBE, 0x0080}, +{0x000EBE, 0x0001}, {0x000EC0, 0x0004}, -{0x000EC5, 0x0080}, +{0x000EC5, 0x0001}, {0x000EC6, 0x0004}, -{0x000EC7, 0x0080}, +{0x000EC7, 0x0001}, {0x000EC8, 0x0010}, -{0x000ECF, 0x0080}, +{0x000ECF, 0x0001}, {0x000ED0, 0x0002}, -{0x000EDA, 0x0080}, +{0x000EDA, 0x0001}, {0x000EDC, 0x0004}, -{0x000EE0, 0x0080}, +{0x000EE0, 0x0001}, {0x000F00, 0x0004}, {0x000F01, 0x0040}, {0x000F04, 0x0020}, @@ -552,26 +556,26 @@ const std::vector> unicode_ranges_flags = { // st {0x000F3A, 0x0020}, {0x000F3E, 0x0010}, {0x000F40, 0x0004}, -{0x000F48, 0x0080}, +{0x000F48, 0x0001}, {0x000F49, 0x0004}, -{0x000F6D, 0x0080}, +{0x000F6D, 0x0001}, {0x000F71, 0x0010}, {0x000F85, 0x0020}, {0x000F86, 0x0010}, {0x000F88, 0x0004}, {0x000F8D, 0x0010}, -{0x000F98, 0x0080}, +{0x000F98, 0x0001}, {0x000F99, 0x0010}, -{0x000FBD, 0x0080}, +{0x000FBD, 0x0001}, {0x000FBE, 0x0040}, {0x000FC6, 0x0010}, {0x000FC7, 0x0040}, -{0x000FCD, 0x0080}, +{0x000FCD, 0x0001}, {0x000FCE, 0x0040}, {0x000FD0, 0x0020}, {0x000FD5, 0x0040}, {0x000FD9, 0x0020}, -{0x000FDB, 0x0080}, +{0x000FDB, 0x0001}, {0x001000, 0x0004}, {0x00102B, 0x0010}, {0x00103F, 0x0004}, @@ -595,56 +599,56 @@ const std::vector> unicode_ranges_flags = { // st {0x00109A, 0x0010}, {0x00109E, 0x0040}, {0x0010A0, 0x0004}, -{0x0010C6, 0x0080}, +{0x0010C6, 0x0001}, {0x0010C7, 0x0004}, -{0x0010C8, 0x0080}, +{0x0010C8, 0x0001}, {0x0010CD, 0x0004}, -{0x0010CE, 0x0080}, +{0x0010CE, 0x0001}, {0x0010D0, 0x0004}, {0x0010FB, 0x0020}, {0x0010FC, 0x0004}, -{0x001249, 0x0080}, +{0x001249, 0x0001}, {0x00124A, 0x0004}, -{0x00124E, 0x0080}, +{0x00124E, 0x0001}, {0x001250, 0x0004}, -{0x001257, 0x0080}, +{0x001257, 0x0001}, {0x001258, 0x0004}, -{0x001259, 0x0080}, +{0x001259, 0x0001}, {0x00125A, 0x0004}, -{0x00125E, 0x0080}, +{0x00125E, 0x0001}, {0x001260, 0x0004}, -{0x001289, 0x0080}, +{0x001289, 0x0001}, {0x00128A, 0x0004}, -{0x00128E, 0x0080}, +{0x00128E, 0x0001}, {0x001290, 0x0004}, -{0x0012B1, 
0x0080}, +{0x0012B1, 0x0001}, {0x0012B2, 0x0004}, -{0x0012B6, 0x0080}, +{0x0012B6, 0x0001}, {0x0012B8, 0x0004}, -{0x0012BF, 0x0080}, +{0x0012BF, 0x0001}, {0x0012C0, 0x0004}, -{0x0012C1, 0x0080}, +{0x0012C1, 0x0001}, {0x0012C2, 0x0004}, -{0x0012C6, 0x0080}, +{0x0012C6, 0x0001}, {0x0012C8, 0x0004}, -{0x0012D7, 0x0080}, +{0x0012D7, 0x0001}, {0x0012D8, 0x0004}, -{0x001311, 0x0080}, +{0x001311, 0x0001}, {0x001312, 0x0004}, -{0x001316, 0x0080}, +{0x001316, 0x0001}, {0x001318, 0x0004}, -{0x00135B, 0x0080}, +{0x00135B, 0x0001}, {0x00135D, 0x0010}, {0x001360, 0x0020}, {0x001369, 0x0002}, -{0x00137D, 0x0080}, +{0x00137D, 0x0001}, {0x001380, 0x0004}, {0x001390, 0x0040}, -{0x00139A, 0x0080}, +{0x00139A, 0x0001}, {0x0013A0, 0x0004}, -{0x0013F6, 0x0080}, +{0x0013F6, 0x0001}, {0x0013F8, 0x0004}, -{0x0013FE, 0x0080}, +{0x0013FE, 0x0001}, {0x001400, 0x0020}, {0x001401, 0x0004}, {0x00166D, 0x0040}, @@ -653,28 +657,28 @@ const std::vector> unicode_ranges_flags = { // st {0x001680, 0x0008}, {0x001681, 0x0004}, {0x00169B, 0x0020}, -{0x00169D, 0x0080}, +{0x00169D, 0x0001}, {0x0016A0, 0x0004}, {0x0016EB, 0x0020}, {0x0016EE, 0x0002}, {0x0016F1, 0x0004}, -{0x0016F9, 0x0080}, +{0x0016F9, 0x0001}, {0x001700, 0x0004}, {0x001712, 0x0010}, -{0x001716, 0x0080}, +{0x001716, 0x0001}, {0x00171F, 0x0004}, {0x001732, 0x0010}, {0x001735, 0x0020}, -{0x001737, 0x0080}, +{0x001737, 0x0001}, {0x001740, 0x0004}, {0x001752, 0x0010}, -{0x001754, 0x0080}, +{0x001754, 0x0001}, {0x001760, 0x0004}, -{0x00176D, 0x0080}, +{0x00176D, 0x0001}, {0x00176E, 0x0004}, -{0x001771, 0x0080}, +{0x001771, 0x0001}, {0x001772, 0x0010}, -{0x001774, 0x0080}, +{0x001774, 0x0001}, {0x001780, 0x0004}, {0x0017B4, 0x0010}, {0x0017D4, 0x0020}, @@ -683,80 +687,80 @@ const std::vector> unicode_ranges_flags = { // st {0x0017DB, 0x0040}, {0x0017DC, 0x0004}, {0x0017DD, 0x0010}, -{0x0017DE, 0x0080}, +{0x0017DE, 0x0001}, {0x0017E0, 0x0002}, -{0x0017EA, 0x0080}, +{0x0017EA, 0x0001}, {0x0017F0, 0x0002}, -{0x0017FA, 0x0080}, +{0x0017FA, 0x0001}, 
{0x001800, 0x0020}, {0x00180B, 0x0010}, {0x00180E, 0x0080}, {0x00180F, 0x0010}, {0x001810, 0x0002}, -{0x00181A, 0x0080}, +{0x00181A, 0x0001}, {0x001820, 0x0004}, -{0x001879, 0x0080}, +{0x001879, 0x0001}, {0x001880, 0x0004}, {0x001885, 0x0010}, {0x001887, 0x0004}, {0x0018A9, 0x0010}, {0x0018AA, 0x0004}, -{0x0018AB, 0x0080}, +{0x0018AB, 0x0001}, {0x0018B0, 0x0004}, -{0x0018F6, 0x0080}, +{0x0018F6, 0x0001}, {0x001900, 0x0004}, -{0x00191F, 0x0080}, +{0x00191F, 0x0001}, {0x001920, 0x0010}, -{0x00192C, 0x0080}, +{0x00192C, 0x0001}, {0x001930, 0x0010}, -{0x00193C, 0x0080}, +{0x00193C, 0x0001}, {0x001940, 0x0040}, -{0x001941, 0x0080}, +{0x001941, 0x0001}, {0x001944, 0x0020}, {0x001946, 0x0002}, {0x001950, 0x0004}, -{0x00196E, 0x0080}, +{0x00196E, 0x0001}, {0x001970, 0x0004}, -{0x001975, 0x0080}, +{0x001975, 0x0001}, {0x001980, 0x0004}, -{0x0019AC, 0x0080}, +{0x0019AC, 0x0001}, {0x0019B0, 0x0004}, -{0x0019CA, 0x0080}, +{0x0019CA, 0x0001}, {0x0019D0, 0x0002}, -{0x0019DB, 0x0080}, +{0x0019DB, 0x0001}, {0x0019DE, 0x0040}, {0x001A00, 0x0004}, {0x001A17, 0x0010}, -{0x001A1C, 0x0080}, +{0x001A1C, 0x0001}, {0x001A1E, 0x0020}, {0x001A20, 0x0004}, {0x001A55, 0x0010}, -{0x001A5F, 0x0080}, +{0x001A5F, 0x0001}, {0x001A60, 0x0010}, -{0x001A7D, 0x0080}, +{0x001A7D, 0x0001}, {0x001A7F, 0x0010}, {0x001A80, 0x0002}, -{0x001A8A, 0x0080}, +{0x001A8A, 0x0001}, {0x001A90, 0x0002}, -{0x001A9A, 0x0080}, +{0x001A9A, 0x0001}, {0x001AA0, 0x0020}, {0x001AA7, 0x0004}, {0x001AA8, 0x0020}, -{0x001AAE, 0x0080}, +{0x001AAE, 0x0001}, {0x001AB0, 0x0010}, -{0x001ACF, 0x0080}, +{0x001ACF, 0x0001}, {0x001B00, 0x0010}, {0x001B05, 0x0004}, {0x001B34, 0x0010}, {0x001B45, 0x0004}, -{0x001B4D, 0x0080}, +{0x001B4D, 0x0001}, {0x001B50, 0x0002}, {0x001B5A, 0x0020}, {0x001B61, 0x0040}, {0x001B6B, 0x0010}, {0x001B74, 0x0040}, {0x001B7D, 0x0020}, -{0x001B7F, 0x0080}, +{0x001B7F, 0x0001}, {0x001B80, 0x0010}, {0x001B83, 0x0004}, {0x001BA1, 0x0010}, @@ -764,25 +768,25 @@ const std::vector> unicode_ranges_flags = { // st 
{0x001BB0, 0x0002}, {0x001BBA, 0x0004}, {0x001BE6, 0x0010}, -{0x001BF4, 0x0080}, +{0x001BF4, 0x0001}, {0x001BFC, 0x0020}, {0x001C00, 0x0004}, {0x001C24, 0x0010}, -{0x001C38, 0x0080}, +{0x001C38, 0x0001}, {0x001C3B, 0x0020}, {0x001C40, 0x0002}, -{0x001C4A, 0x0080}, +{0x001C4A, 0x0001}, {0x001C4D, 0x0004}, {0x001C50, 0x0002}, {0x001C5A, 0x0004}, {0x001C7E, 0x0020}, {0x001C80, 0x0004}, -{0x001C89, 0x0080}, +{0x001C89, 0x0001}, {0x001C90, 0x0004}, -{0x001CBB, 0x0080}, +{0x001CBB, 0x0001}, {0x001CBD, 0x0004}, {0x001CC0, 0x0020}, -{0x001CC8, 0x0080}, +{0x001CC8, 0x0001}, {0x001CD0, 0x0010}, {0x001CD3, 0x0020}, {0x001CD4, 0x0010}, @@ -793,50 +797,50 @@ const std::vector> unicode_ranges_flags = { // st {0x001CF5, 0x0004}, {0x001CF7, 0x0010}, {0x001CFA, 0x0004}, -{0x001CFB, 0x0080}, +{0x001CFB, 0x0001}, {0x001D00, 0x0004}, {0x001DC0, 0x0010}, {0x001E00, 0x0004}, -{0x001F16, 0x0080}, +{0x001F16, 0x0001}, {0x001F18, 0x0004}, -{0x001F1E, 0x0080}, +{0x001F1E, 0x0001}, {0x001F20, 0x0004}, -{0x001F46, 0x0080}, +{0x001F46, 0x0001}, {0x001F48, 0x0004}, -{0x001F4E, 0x0080}, +{0x001F4E, 0x0001}, {0x001F50, 0x0004}, -{0x001F58, 0x0080}, +{0x001F58, 0x0001}, {0x001F59, 0x0004}, -{0x001F5A, 0x0080}, +{0x001F5A, 0x0001}, {0x001F5B, 0x0004}, -{0x001F5C, 0x0080}, +{0x001F5C, 0x0001}, {0x001F5D, 0x0004}, -{0x001F5E, 0x0080}, +{0x001F5E, 0x0001}, {0x001F5F, 0x0004}, -{0x001F7E, 0x0080}, +{0x001F7E, 0x0001}, {0x001F80, 0x0004}, -{0x001FB5, 0x0080}, +{0x001FB5, 0x0001}, {0x001FB6, 0x0004}, {0x001FBD, 0x0040}, {0x001FBE, 0x0004}, {0x001FBF, 0x0040}, {0x001FC2, 0x0004}, -{0x001FC5, 0x0080}, +{0x001FC5, 0x0001}, {0x001FC6, 0x0004}, {0x001FCD, 0x0040}, {0x001FD0, 0x0004}, -{0x001FD4, 0x0080}, +{0x001FD4, 0x0001}, {0x001FD6, 0x0004}, -{0x001FDC, 0x0080}, +{0x001FDC, 0x0001}, {0x001FDD, 0x0040}, {0x001FE0, 0x0004}, {0x001FED, 0x0040}, -{0x001FF0, 0x0080}, +{0x001FF0, 0x0001}, {0x001FF2, 0x0004}, -{0x001FF5, 0x0080}, +{0x001FF5, 0x0001}, {0x001FF6, 0x0004}, {0x001FFD, 0x0040}, -{0x001FFF, 0x0080}, 
+{0x001FFF, 0x0001}, {0x002000, 0x0008}, {0x00200B, 0x0080}, {0x002010, 0x0020}, @@ -850,9 +854,11 @@ const std::vector> unicode_ranges_flags = { // st {0x002053, 0x0020}, {0x00205F, 0x0008}, {0x002060, 0x0080}, +{0x002065, 0x0001}, +{0x002066, 0x0080}, {0x002070, 0x0002}, {0x002071, 0x0004}, -{0x002072, 0x0080}, +{0x002072, 0x0001}, {0x002074, 0x0002}, {0x00207A, 0x0040}, {0x00207D, 0x0020}, @@ -860,13 +866,13 @@ const std::vector> unicode_ranges_flags = { // st {0x002080, 0x0002}, {0x00208A, 0x0040}, {0x00208D, 0x0020}, -{0x00208F, 0x0080}, +{0x00208F, 0x0001}, {0x002090, 0x0004}, -{0x00209D, 0x0080}, +{0x00209D, 0x0001}, {0x0020A0, 0x0040}, -{0x0020C1, 0x0080}, +{0x0020C1, 0x0001}, {0x0020D0, 0x0010}, -{0x0020F1, 0x0080}, +{0x0020F1, 0x0001}, {0x002100, 0x0040}, {0x002102, 0x0004}, {0x002103, 0x0040}, @@ -898,15 +904,15 @@ const std::vector> unicode_ranges_flags = { // st {0x002183, 0x0004}, {0x002185, 0x0002}, {0x00218A, 0x0040}, -{0x00218C, 0x0080}, +{0x00218C, 0x0001}, {0x002190, 0x0040}, {0x002308, 0x0020}, {0x00230C, 0x0040}, {0x002329, 0x0020}, {0x00232B, 0x0040}, -{0x002427, 0x0080}, +{0x002427, 0x0001}, {0x002440, 0x0040}, -{0x00244B, 0x0080}, +{0x00244B, 0x0001}, {0x002460, 0x0002}, {0x00249C, 0x0040}, {0x0024EA, 0x0002}, @@ -924,62 +930,62 @@ const std::vector> unicode_ranges_flags = { // st {0x0029DC, 0x0040}, {0x0029FC, 0x0020}, {0x0029FE, 0x0040}, -{0x002B74, 0x0080}, +{0x002B74, 0x0001}, {0x002B76, 0x0040}, -{0x002B96, 0x0080}, +{0x002B96, 0x0001}, {0x002B97, 0x0040}, {0x002C00, 0x0004}, {0x002CE5, 0x0040}, {0x002CEB, 0x0004}, {0x002CEF, 0x0010}, {0x002CF2, 0x0004}, -{0x002CF4, 0x0080}, +{0x002CF4, 0x0001}, {0x002CF9, 0x0020}, {0x002CFD, 0x0002}, {0x002CFE, 0x0020}, {0x002D00, 0x0004}, -{0x002D26, 0x0080}, +{0x002D26, 0x0001}, {0x002D27, 0x0004}, -{0x002D28, 0x0080}, +{0x002D28, 0x0001}, {0x002D2D, 0x0004}, -{0x002D2E, 0x0080}, +{0x002D2E, 0x0001}, {0x002D30, 0x0004}, -{0x002D68, 0x0080}, +{0x002D68, 0x0001}, {0x002D6F, 0x0004}, {0x002D70, 0x0020}, 
-{0x002D71, 0x0080}, +{0x002D71, 0x0001}, {0x002D7F, 0x0010}, {0x002D80, 0x0004}, -{0x002D97, 0x0080}, +{0x002D97, 0x0001}, {0x002DA0, 0x0004}, -{0x002DA7, 0x0080}, +{0x002DA7, 0x0001}, {0x002DA8, 0x0004}, -{0x002DAF, 0x0080}, +{0x002DAF, 0x0001}, {0x002DB0, 0x0004}, -{0x002DB7, 0x0080}, +{0x002DB7, 0x0001}, {0x002DB8, 0x0004}, -{0x002DBF, 0x0080}, +{0x002DBF, 0x0001}, {0x002DC0, 0x0004}, -{0x002DC7, 0x0080}, +{0x002DC7, 0x0001}, {0x002DC8, 0x0004}, -{0x002DCF, 0x0080}, +{0x002DCF, 0x0001}, {0x002DD0, 0x0004}, -{0x002DD7, 0x0080}, +{0x002DD7, 0x0001}, {0x002DD8, 0x0004}, -{0x002DDF, 0x0080}, +{0x002DDF, 0x0001}, {0x002DE0, 0x0010}, {0x002E00, 0x0020}, {0x002E2F, 0x0004}, {0x002E30, 0x0020}, {0x002E50, 0x0040}, {0x002E52, 0x0020}, -{0x002E5E, 0x0080}, +{0x002E5E, 0x0001}, {0x002E80, 0x0040}, -{0x002E9A, 0x0080}, +{0x002E9A, 0x0001}, {0x002E9B, 0x0040}, -{0x002EF4, 0x0080}, +{0x002EF4, 0x0001}, {0x002F00, 0x0040}, -{0x002FD6, 0x0080}, +{0x002FD6, 0x0001}, {0x002FF0, 0x0040}, {0x003000, 0x0008}, {0x003001, 0x0020}, @@ -999,9 +1005,9 @@ const std::vector> unicode_ranges_flags = { // st {0x00303B, 0x0004}, {0x00303D, 0x0020}, {0x00303E, 0x0040}, -{0x003040, 0x0080}, +{0x003040, 0x0001}, {0x003041, 0x0004}, -{0x003097, 0x0080}, +{0x003097, 0x0001}, {0x003099, 0x0010}, {0x00309B, 0x0040}, {0x00309D, 0x0004}, @@ -1009,21 +1015,21 @@ const std::vector> unicode_ranges_flags = { // st {0x0030A1, 0x0004}, {0x0030FB, 0x0020}, {0x0030FC, 0x0004}, -{0x003100, 0x0080}, +{0x003100, 0x0001}, {0x003105, 0x0004}, -{0x003130, 0x0080}, +{0x003130, 0x0001}, {0x003131, 0x0004}, -{0x00318F, 0x0080}, +{0x00318F, 0x0001}, {0x003190, 0x0040}, {0x003192, 0x0002}, {0x003196, 0x0040}, {0x0031A0, 0x0004}, {0x0031C0, 0x0040}, -{0x0031E4, 0x0080}, +{0x0031E4, 0x0001}, {0x0031EF, 0x0040}, {0x0031F0, 0x0004}, {0x003200, 0x0040}, -{0x00321F, 0x0080}, +{0x00321F, 0x0001}, {0x003220, 0x0002}, {0x00322A, 0x0040}, {0x003248, 0x0002}, @@ -1037,9 +1043,9 @@ const std::vector> unicode_ranges_flags = { // st 
{0x003400, 0x0004}, {0x004DC0, 0x0040}, {0x004E00, 0x0004}, -{0x00A48D, 0x0080}, +{0x00A48D, 0x0001}, {0x00A490, 0x0040}, -{0x00A4C7, 0x0080}, +{0x00A4C7, 0x0001}, {0x00A4D0, 0x0004}, {0x00A4FE, 0x0020}, {0x00A500, 0x0004}, @@ -1047,7 +1053,7 @@ const std::vector> unicode_ranges_flags = { // st {0x00A610, 0x0004}, {0x00A620, 0x0002}, {0x00A62A, 0x0004}, -{0x00A62C, 0x0080}, +{0x00A62C, 0x0001}, {0x00A640, 0x0004}, {0x00A66F, 0x0010}, {0x00A673, 0x0020}, @@ -1059,20 +1065,20 @@ const std::vector> unicode_ranges_flags = { // st {0x00A6E6, 0x0002}, {0x00A6F0, 0x0010}, {0x00A6F2, 0x0020}, -{0x00A6F8, 0x0080}, +{0x00A6F8, 0x0001}, {0x00A700, 0x0040}, {0x00A717, 0x0004}, {0x00A720, 0x0040}, {0x00A722, 0x0004}, {0x00A789, 0x0040}, {0x00A78B, 0x0004}, -{0x00A7CB, 0x0080}, +{0x00A7CB, 0x0001}, {0x00A7D0, 0x0004}, -{0x00A7D2, 0x0080}, +{0x00A7D2, 0x0001}, {0x00A7D3, 0x0004}, -{0x00A7D4, 0x0080}, +{0x00A7D4, 0x0001}, {0x00A7D5, 0x0004}, -{0x00A7DA, 0x0080}, +{0x00A7DA, 0x0001}, {0x00A7F2, 0x0004}, {0x00A802, 0x0010}, {0x00A803, 0x0004}, @@ -1083,20 +1089,20 @@ const std::vector> unicode_ranges_flags = { // st {0x00A823, 0x0010}, {0x00A828, 0x0040}, {0x00A82C, 0x0010}, -{0x00A82D, 0x0080}, +{0x00A82D, 0x0001}, {0x00A830, 0x0002}, {0x00A836, 0x0040}, -{0x00A83A, 0x0080}, +{0x00A83A, 0x0001}, {0x00A840, 0x0004}, {0x00A874, 0x0020}, -{0x00A878, 0x0080}, +{0x00A878, 0x0001}, {0x00A880, 0x0010}, {0x00A882, 0x0004}, {0x00A8B4, 0x0010}, -{0x00A8C6, 0x0080}, +{0x00A8C6, 0x0001}, {0x00A8CE, 0x0020}, {0x00A8D0, 0x0002}, -{0x00A8DA, 0x0080}, +{0x00A8DA, 0x0001}, {0x00A8E0, 0x0010}, {0x00A8F2, 0x0004}, {0x00A8F8, 0x0020}, @@ -1110,35 +1116,35 @@ const std::vector> unicode_ranges_flags = { // st {0x00A92E, 0x0020}, {0x00A930, 0x0004}, {0x00A947, 0x0010}, -{0x00A954, 0x0080}, +{0x00A954, 0x0001}, {0x00A95F, 0x0020}, {0x00A960, 0x0004}, -{0x00A97D, 0x0080}, +{0x00A97D, 0x0001}, {0x00A980, 0x0010}, {0x00A984, 0x0004}, {0x00A9B3, 0x0010}, {0x00A9C1, 0x0020}, -{0x00A9CE, 0x0080}, +{0x00A9CE, 
0x0001}, {0x00A9CF, 0x0004}, {0x00A9D0, 0x0002}, -{0x00A9DA, 0x0080}, +{0x00A9DA, 0x0001}, {0x00A9DE, 0x0020}, {0x00A9E0, 0x0004}, {0x00A9E5, 0x0010}, {0x00A9E6, 0x0004}, {0x00A9F0, 0x0002}, {0x00A9FA, 0x0004}, -{0x00A9FF, 0x0080}, +{0x00A9FF, 0x0001}, {0x00AA00, 0x0004}, {0x00AA29, 0x0010}, -{0x00AA37, 0x0080}, +{0x00AA37, 0x0001}, {0x00AA40, 0x0004}, {0x00AA43, 0x0010}, {0x00AA44, 0x0004}, {0x00AA4C, 0x0010}, -{0x00AA4E, 0x0080}, +{0x00AA4E, 0x0001}, {0x00AA50, 0x0002}, -{0x00AA5A, 0x0080}, +{0x00AA5A, 0x0001}, {0x00AA5C, 0x0020}, {0x00AA60, 0x0004}, {0x00AA77, 0x0040}, @@ -1155,7 +1161,7 @@ const std::vector> unicode_ranges_flags = { // st {0x00AAC0, 0x0004}, {0x00AAC1, 0x0010}, {0x00AAC2, 0x0004}, -{0x00AAC3, 0x0080}, +{0x00AAC3, 0x0001}, {0x00AADB, 0x0004}, {0x00AADE, 0x0020}, {0x00AAE0, 0x0004}, @@ -1163,90 +1169,93 @@ const std::vector> unicode_ranges_flags = { // st {0x00AAF0, 0x0020}, {0x00AAF2, 0x0004}, {0x00AAF5, 0x0010}, -{0x00AAF7, 0x0080}, +{0x00AAF7, 0x0001}, {0x00AB01, 0x0004}, -{0x00AB07, 0x0080}, +{0x00AB07, 0x0001}, {0x00AB09, 0x0004}, -{0x00AB0F, 0x0080}, +{0x00AB0F, 0x0001}, {0x00AB11, 0x0004}, -{0x00AB17, 0x0080}, +{0x00AB17, 0x0001}, {0x00AB20, 0x0004}, -{0x00AB27, 0x0080}, +{0x00AB27, 0x0001}, {0x00AB28, 0x0004}, -{0x00AB2F, 0x0080}, +{0x00AB2F, 0x0001}, {0x00AB30, 0x0004}, {0x00AB5B, 0x0040}, {0x00AB5C, 0x0004}, {0x00AB6A, 0x0040}, -{0x00AB6C, 0x0080}, +{0x00AB6C, 0x0001}, {0x00AB70, 0x0004}, {0x00ABE3, 0x0010}, {0x00ABEB, 0x0020}, {0x00ABEC, 0x0010}, -{0x00ABEE, 0x0080}, +{0x00ABEE, 0x0001}, {0x00ABF0, 0x0002}, -{0x00ABFA, 0x0080}, +{0x00ABFA, 0x0001}, {0x00AC00, 0x0004}, -{0x00D7A4, 0x0080}, +{0x00D7A4, 0x0001}, {0x00D7B0, 0x0004}, -{0x00D7C7, 0x0080}, +{0x00D7C7, 0x0001}, {0x00D7CB, 0x0004}, -{0x00D7FC, 0x0080}, +{0x00D7FC, 0x0001}, +{0x00D800, 0x0080}, {0x00F900, 0x0004}, -{0x00FA6E, 0x0080}, +{0x00FA6E, 0x0001}, {0x00FA70, 0x0004}, -{0x00FADA, 0x0080}, +{0x00FADA, 0x0001}, {0x00FB00, 0x0004}, -{0x00FB07, 0x0080}, +{0x00FB07, 0x0001}, 
{0x00FB13, 0x0004}, -{0x00FB18, 0x0080}, +{0x00FB18, 0x0001}, {0x00FB1D, 0x0004}, {0x00FB1E, 0x0010}, {0x00FB1F, 0x0004}, {0x00FB29, 0x0040}, {0x00FB2A, 0x0004}, -{0x00FB37, 0x0080}, +{0x00FB37, 0x0001}, {0x00FB38, 0x0004}, -{0x00FB3D, 0x0080}, +{0x00FB3D, 0x0001}, {0x00FB3E, 0x0004}, -{0x00FB3F, 0x0080}, +{0x00FB3F, 0x0001}, {0x00FB40, 0x0004}, -{0x00FB42, 0x0080}, +{0x00FB42, 0x0001}, {0x00FB43, 0x0004}, -{0x00FB45, 0x0080}, +{0x00FB45, 0x0001}, {0x00FB46, 0x0004}, {0x00FBB2, 0x0040}, -{0x00FBC3, 0x0080}, +{0x00FBC3, 0x0001}, {0x00FBD3, 0x0004}, {0x00FD3E, 0x0020}, {0x00FD40, 0x0040}, {0x00FD50, 0x0004}, -{0x00FD90, 0x0080}, +{0x00FD90, 0x0001}, {0x00FD92, 0x0004}, -{0x00FDC8, 0x0080}, +{0x00FDC8, 0x0001}, {0x00FDCF, 0x0040}, -{0x00FDD0, 0x0080}, +{0x00FDD0, 0x0001}, {0x00FDF0, 0x0004}, {0x00FDFC, 0x0040}, {0x00FE00, 0x0010}, {0x00FE10, 0x0020}, -{0x00FE1A, 0x0080}, +{0x00FE1A, 0x0001}, {0x00FE20, 0x0010}, {0x00FE30, 0x0020}, -{0x00FE53, 0x0080}, +{0x00FE53, 0x0001}, {0x00FE54, 0x0020}, {0x00FE62, 0x0040}, {0x00FE63, 0x0020}, {0x00FE64, 0x0040}, -{0x00FE67, 0x0080}, +{0x00FE67, 0x0001}, {0x00FE68, 0x0020}, {0x00FE69, 0x0040}, {0x00FE6A, 0x0020}, -{0x00FE6C, 0x0080}, +{0x00FE6C, 0x0001}, {0x00FE70, 0x0004}, -{0x00FE75, 0x0080}, +{0x00FE75, 0x0001}, {0x00FE76, 0x0004}, -{0x00FEFD, 0x0080}, +{0x00FEFD, 0x0001}, +{0x00FEFF, 0x0080}, +{0x00FF00, 0x0001}, {0x00FF01, 0x0020}, {0x00FF04, 0x0040}, {0x00FF05, 0x0020}, @@ -1268,260 +1277,261 @@ const std::vector> unicode_ranges_flags = { // st {0x00FF5E, 0x0040}, {0x00FF5F, 0x0020}, {0x00FF66, 0x0004}, -{0x00FFBF, 0x0080}, +{0x00FFBF, 0x0001}, {0x00FFC2, 0x0004}, -{0x00FFC8, 0x0080}, +{0x00FFC8, 0x0001}, {0x00FFCA, 0x0004}, -{0x00FFD0, 0x0080}, +{0x00FFD0, 0x0001}, {0x00FFD2, 0x0004}, -{0x00FFD8, 0x0080}, +{0x00FFD8, 0x0001}, {0x00FFDA, 0x0004}, -{0x00FFDD, 0x0080}, +{0x00FFDD, 0x0001}, {0x00FFE0, 0x0040}, -{0x00FFE7, 0x0080}, +{0x00FFE7, 0x0001}, {0x00FFE8, 0x0040}, -{0x00FFEF, 0x0080}, +{0x00FFEF, 0x0001}, +{0x00FFF9, 
0x0080}, {0x00FFFC, 0x0040}, -{0x00FFFE, 0x0080}, +{0x00FFFE, 0x0001}, {0x010000, 0x0004}, -{0x01000C, 0x0080}, +{0x01000C, 0x0001}, {0x01000D, 0x0004}, -{0x010027, 0x0080}, +{0x010027, 0x0001}, {0x010028, 0x0004}, -{0x01003B, 0x0080}, +{0x01003B, 0x0001}, {0x01003C, 0x0004}, -{0x01003E, 0x0080}, +{0x01003E, 0x0001}, {0x01003F, 0x0004}, -{0x01004E, 0x0080}, +{0x01004E, 0x0001}, {0x010050, 0x0004}, -{0x01005E, 0x0080}, +{0x01005E, 0x0001}, {0x010080, 0x0004}, -{0x0100FB, 0x0080}, +{0x0100FB, 0x0001}, {0x010100, 0x0020}, -{0x010103, 0x0080}, +{0x010103, 0x0001}, {0x010107, 0x0002}, -{0x010134, 0x0080}, +{0x010134, 0x0001}, {0x010137, 0x0040}, {0x010140, 0x0002}, {0x010179, 0x0040}, {0x01018A, 0x0002}, {0x01018C, 0x0040}, -{0x01018F, 0x0080}, +{0x01018F, 0x0001}, {0x010190, 0x0040}, -{0x01019D, 0x0080}, +{0x01019D, 0x0001}, {0x0101A0, 0x0040}, -{0x0101A1, 0x0080}, +{0x0101A1, 0x0001}, {0x0101D0, 0x0040}, {0x0101FD, 0x0010}, -{0x0101FE, 0x0080}, +{0x0101FE, 0x0001}, {0x010280, 0x0004}, -{0x01029D, 0x0080}, +{0x01029D, 0x0001}, {0x0102A0, 0x0004}, -{0x0102D1, 0x0080}, +{0x0102D1, 0x0001}, {0x0102E0, 0x0010}, {0x0102E1, 0x0002}, -{0x0102FC, 0x0080}, +{0x0102FC, 0x0001}, {0x010300, 0x0004}, {0x010320, 0x0002}, -{0x010324, 0x0080}, +{0x010324, 0x0001}, {0x01032D, 0x0004}, {0x010341, 0x0002}, {0x010342, 0x0004}, {0x01034A, 0x0002}, -{0x01034B, 0x0080}, +{0x01034B, 0x0001}, {0x010350, 0x0004}, {0x010376, 0x0010}, -{0x01037B, 0x0080}, +{0x01037B, 0x0001}, {0x010380, 0x0004}, -{0x01039E, 0x0080}, +{0x01039E, 0x0001}, {0x01039F, 0x0020}, {0x0103A0, 0x0004}, -{0x0103C4, 0x0080}, +{0x0103C4, 0x0001}, {0x0103C8, 0x0004}, {0x0103D0, 0x0020}, {0x0103D1, 0x0002}, -{0x0103D6, 0x0080}, +{0x0103D6, 0x0001}, {0x010400, 0x0004}, -{0x01049E, 0x0080}, +{0x01049E, 0x0001}, {0x0104A0, 0x0002}, -{0x0104AA, 0x0080}, +{0x0104AA, 0x0001}, {0x0104B0, 0x0004}, -{0x0104D4, 0x0080}, +{0x0104D4, 0x0001}, {0x0104D8, 0x0004}, -{0x0104FC, 0x0080}, +{0x0104FC, 0x0001}, {0x010500, 0x0004}, -{0x010528, 
0x0080}, +{0x010528, 0x0001}, {0x010530, 0x0004}, -{0x010564, 0x0080}, +{0x010564, 0x0001}, {0x01056F, 0x0020}, {0x010570, 0x0004}, -{0x01057B, 0x0080}, +{0x01057B, 0x0001}, {0x01057C, 0x0004}, -{0x01058B, 0x0080}, +{0x01058B, 0x0001}, {0x01058C, 0x0004}, -{0x010593, 0x0080}, +{0x010593, 0x0001}, {0x010594, 0x0004}, -{0x010596, 0x0080}, +{0x010596, 0x0001}, {0x010597, 0x0004}, -{0x0105A2, 0x0080}, +{0x0105A2, 0x0001}, {0x0105A3, 0x0004}, -{0x0105B2, 0x0080}, +{0x0105B2, 0x0001}, {0x0105B3, 0x0004}, -{0x0105BA, 0x0080}, +{0x0105BA, 0x0001}, {0x0105BB, 0x0004}, -{0x0105BD, 0x0080}, +{0x0105BD, 0x0001}, {0x010600, 0x0004}, -{0x010737, 0x0080}, +{0x010737, 0x0001}, {0x010740, 0x0004}, -{0x010756, 0x0080}, +{0x010756, 0x0001}, {0x010760, 0x0004}, -{0x010768, 0x0080}, +{0x010768, 0x0001}, {0x010780, 0x0004}, -{0x010786, 0x0080}, +{0x010786, 0x0001}, {0x010787, 0x0004}, -{0x0107B1, 0x0080}, +{0x0107B1, 0x0001}, {0x0107B2, 0x0004}, -{0x0107BB, 0x0080}, +{0x0107BB, 0x0001}, {0x010800, 0x0004}, -{0x010806, 0x0080}, +{0x010806, 0x0001}, {0x010808, 0x0004}, -{0x010809, 0x0080}, +{0x010809, 0x0001}, {0x01080A, 0x0004}, -{0x010836, 0x0080}, +{0x010836, 0x0001}, {0x010837, 0x0004}, -{0x010839, 0x0080}, +{0x010839, 0x0001}, {0x01083C, 0x0004}, -{0x01083D, 0x0080}, +{0x01083D, 0x0001}, {0x01083F, 0x0004}, -{0x010856, 0x0080}, +{0x010856, 0x0001}, {0x010857, 0x0020}, {0x010858, 0x0002}, {0x010860, 0x0004}, {0x010877, 0x0040}, {0x010879, 0x0002}, {0x010880, 0x0004}, -{0x01089F, 0x0080}, +{0x01089F, 0x0001}, {0x0108A7, 0x0002}, -{0x0108B0, 0x0080}, +{0x0108B0, 0x0001}, {0x0108E0, 0x0004}, -{0x0108F3, 0x0080}, +{0x0108F3, 0x0001}, {0x0108F4, 0x0004}, -{0x0108F6, 0x0080}, +{0x0108F6, 0x0001}, {0x0108FB, 0x0002}, {0x010900, 0x0004}, {0x010916, 0x0002}, -{0x01091C, 0x0080}, +{0x01091C, 0x0001}, {0x01091F, 0x0020}, {0x010920, 0x0004}, -{0x01093A, 0x0080}, +{0x01093A, 0x0001}, {0x01093F, 0x0020}, -{0x010940, 0x0080}, +{0x010940, 0x0001}, {0x010980, 0x0004}, -{0x0109B8, 0x0080}, +{0x0109B8, 
0x0001}, {0x0109BC, 0x0002}, {0x0109BE, 0x0004}, {0x0109C0, 0x0002}, -{0x0109D0, 0x0080}, +{0x0109D0, 0x0001}, {0x0109D2, 0x0002}, {0x010A00, 0x0004}, {0x010A01, 0x0010}, -{0x010A04, 0x0080}, +{0x010A04, 0x0001}, {0x010A05, 0x0010}, -{0x010A07, 0x0080}, +{0x010A07, 0x0001}, {0x010A0C, 0x0010}, {0x010A10, 0x0004}, -{0x010A14, 0x0080}, +{0x010A14, 0x0001}, {0x010A15, 0x0004}, -{0x010A18, 0x0080}, +{0x010A18, 0x0001}, {0x010A19, 0x0004}, -{0x010A36, 0x0080}, +{0x010A36, 0x0001}, {0x010A38, 0x0010}, -{0x010A3B, 0x0080}, +{0x010A3B, 0x0001}, {0x010A3F, 0x0010}, {0x010A40, 0x0002}, -{0x010A49, 0x0080}, +{0x010A49, 0x0001}, {0x010A50, 0x0020}, -{0x010A59, 0x0080}, +{0x010A59, 0x0001}, {0x010A60, 0x0004}, {0x010A7D, 0x0002}, {0x010A7F, 0x0020}, {0x010A80, 0x0004}, {0x010A9D, 0x0002}, -{0x010AA0, 0x0080}, +{0x010AA0, 0x0001}, {0x010AC0, 0x0004}, {0x010AC8, 0x0040}, {0x010AC9, 0x0004}, {0x010AE5, 0x0010}, -{0x010AE7, 0x0080}, +{0x010AE7, 0x0001}, {0x010AEB, 0x0002}, {0x010AF0, 0x0020}, -{0x010AF7, 0x0080}, +{0x010AF7, 0x0001}, {0x010B00, 0x0004}, -{0x010B36, 0x0080}, +{0x010B36, 0x0001}, {0x010B39, 0x0020}, {0x010B40, 0x0004}, -{0x010B56, 0x0080}, +{0x010B56, 0x0001}, {0x010B58, 0x0002}, {0x010B60, 0x0004}, -{0x010B73, 0x0080}, +{0x010B73, 0x0001}, {0x010B78, 0x0002}, {0x010B80, 0x0004}, -{0x010B92, 0x0080}, +{0x010B92, 0x0001}, {0x010B99, 0x0020}, -{0x010B9D, 0x0080}, +{0x010B9D, 0x0001}, {0x010BA9, 0x0002}, -{0x010BB0, 0x0080}, +{0x010BB0, 0x0001}, {0x010C00, 0x0004}, -{0x010C49, 0x0080}, +{0x010C49, 0x0001}, {0x010C80, 0x0004}, -{0x010CB3, 0x0080}, +{0x010CB3, 0x0001}, {0x010CC0, 0x0004}, -{0x010CF3, 0x0080}, +{0x010CF3, 0x0001}, {0x010CFA, 0x0002}, {0x010D00, 0x0004}, {0x010D24, 0x0010}, -{0x010D28, 0x0080}, +{0x010D28, 0x0001}, {0x010D30, 0x0002}, -{0x010D3A, 0x0080}, +{0x010D3A, 0x0001}, {0x010E60, 0x0002}, -{0x010E7F, 0x0080}, +{0x010E7F, 0x0001}, {0x010E80, 0x0004}, -{0x010EAA, 0x0080}, +{0x010EAA, 0x0001}, {0x010EAB, 0x0010}, {0x010EAD, 0x0020}, -{0x010EAE, 0x0080}, 
+{0x010EAE, 0x0001}, {0x010EB0, 0x0004}, -{0x010EB2, 0x0080}, +{0x010EB2, 0x0001}, {0x010EFD, 0x0010}, {0x010F00, 0x0004}, {0x010F1D, 0x0002}, {0x010F27, 0x0004}, -{0x010F28, 0x0080}, +{0x010F28, 0x0001}, {0x010F30, 0x0004}, {0x010F46, 0x0010}, {0x010F51, 0x0002}, {0x010F55, 0x0020}, -{0x010F5A, 0x0080}, +{0x010F5A, 0x0001}, {0x010F70, 0x0004}, {0x010F82, 0x0010}, {0x010F86, 0x0020}, -{0x010F8A, 0x0080}, +{0x010F8A, 0x0001}, {0x010FB0, 0x0004}, {0x010FC5, 0x0002}, -{0x010FCC, 0x0080}, +{0x010FCC, 0x0001}, {0x010FE0, 0x0004}, -{0x010FF7, 0x0080}, +{0x010FF7, 0x0001}, {0x011000, 0x0010}, {0x011003, 0x0004}, {0x011038, 0x0010}, {0x011047, 0x0020}, -{0x01104E, 0x0080}, +{0x01104E, 0x0001}, {0x011052, 0x0002}, {0x011070, 0x0010}, {0x011071, 0x0004}, {0x011073, 0x0010}, {0x011075, 0x0004}, -{0x011076, 0x0080}, +{0x011076, 0x0001}, {0x01107F, 0x0010}, {0x011083, 0x0004}, {0x0110B0, 0x0010}, @@ -1529,26 +1539,28 @@ const std::vector> unicode_ranges_flags = { // st {0x0110BD, 0x0080}, {0x0110BE, 0x0020}, {0x0110C2, 0x0010}, -{0x0110C3, 0x0080}, +{0x0110C3, 0x0001}, +{0x0110CD, 0x0080}, +{0x0110CE, 0x0001}, {0x0110D0, 0x0004}, -{0x0110E9, 0x0080}, +{0x0110E9, 0x0001}, {0x0110F0, 0x0002}, -{0x0110FA, 0x0080}, +{0x0110FA, 0x0001}, {0x011100, 0x0010}, {0x011103, 0x0004}, {0x011127, 0x0010}, -{0x011135, 0x0080}, +{0x011135, 0x0001}, {0x011136, 0x0002}, {0x011140, 0x0020}, {0x011144, 0x0004}, {0x011145, 0x0010}, {0x011147, 0x0004}, -{0x011148, 0x0080}, +{0x011148, 0x0001}, {0x011150, 0x0004}, {0x011173, 0x0010}, {0x011174, 0x0020}, {0x011176, 0x0004}, -{0x011177, 0x0080}, +{0x011177, 0x0001}, {0x011180, 0x0010}, {0x011183, 0x0004}, {0x0111B3, 0x0010}, @@ -1562,159 +1574,159 @@ const std::vector> unicode_ranges_flags = { // st {0x0111DB, 0x0020}, {0x0111DC, 0x0004}, {0x0111DD, 0x0020}, -{0x0111E0, 0x0080}, +{0x0111E0, 0x0001}, {0x0111E1, 0x0002}, -{0x0111F5, 0x0080}, +{0x0111F5, 0x0001}, {0x011200, 0x0004}, -{0x011212, 0x0080}, +{0x011212, 0x0001}, {0x011213, 0x0004}, {0x01122C, 
0x0010}, {0x011238, 0x0020}, {0x01123E, 0x0010}, {0x01123F, 0x0004}, {0x011241, 0x0010}, -{0x011242, 0x0080}, +{0x011242, 0x0001}, {0x011280, 0x0004}, -{0x011287, 0x0080}, +{0x011287, 0x0001}, {0x011288, 0x0004}, -{0x011289, 0x0080}, +{0x011289, 0x0001}, {0x01128A, 0x0004}, -{0x01128E, 0x0080}, +{0x01128E, 0x0001}, {0x01128F, 0x0004}, -{0x01129E, 0x0080}, +{0x01129E, 0x0001}, {0x01129F, 0x0004}, {0x0112A9, 0x0020}, -{0x0112AA, 0x0080}, +{0x0112AA, 0x0001}, {0x0112B0, 0x0004}, {0x0112DF, 0x0010}, -{0x0112EB, 0x0080}, +{0x0112EB, 0x0001}, {0x0112F0, 0x0002}, -{0x0112FA, 0x0080}, +{0x0112FA, 0x0001}, {0x011300, 0x0010}, -{0x011304, 0x0080}, +{0x011304, 0x0001}, {0x011305, 0x0004}, -{0x01130D, 0x0080}, +{0x01130D, 0x0001}, {0x01130F, 0x0004}, -{0x011311, 0x0080}, +{0x011311, 0x0001}, {0x011313, 0x0004}, -{0x011329, 0x0080}, +{0x011329, 0x0001}, {0x01132A, 0x0004}, -{0x011331, 0x0080}, +{0x011331, 0x0001}, {0x011332, 0x0004}, -{0x011334, 0x0080}, +{0x011334, 0x0001}, {0x011335, 0x0004}, -{0x01133A, 0x0080}, +{0x01133A, 0x0001}, {0x01133B, 0x0010}, {0x01133D, 0x0004}, {0x01133E, 0x0010}, -{0x011345, 0x0080}, +{0x011345, 0x0001}, {0x011347, 0x0010}, -{0x011349, 0x0080}, +{0x011349, 0x0001}, {0x01134B, 0x0010}, -{0x01134E, 0x0080}, +{0x01134E, 0x0001}, {0x011350, 0x0004}, -{0x011351, 0x0080}, +{0x011351, 0x0001}, {0x011357, 0x0010}, -{0x011358, 0x0080}, +{0x011358, 0x0001}, {0x01135D, 0x0004}, {0x011362, 0x0010}, -{0x011364, 0x0080}, +{0x011364, 0x0001}, {0x011366, 0x0010}, -{0x01136D, 0x0080}, +{0x01136D, 0x0001}, {0x011370, 0x0010}, -{0x011375, 0x0080}, +{0x011375, 0x0001}, {0x011400, 0x0004}, {0x011435, 0x0010}, {0x011447, 0x0004}, {0x01144B, 0x0020}, {0x011450, 0x0002}, {0x01145A, 0x0020}, -{0x01145C, 0x0080}, +{0x01145C, 0x0001}, {0x01145D, 0x0020}, {0x01145E, 0x0010}, {0x01145F, 0x0004}, -{0x011462, 0x0080}, +{0x011462, 0x0001}, {0x011480, 0x0004}, {0x0114B0, 0x0010}, {0x0114C4, 0x0004}, {0x0114C6, 0x0020}, {0x0114C7, 0x0004}, -{0x0114C8, 0x0080}, +{0x0114C8, 
0x0001}, {0x0114D0, 0x0002}, -{0x0114DA, 0x0080}, +{0x0114DA, 0x0001}, {0x011580, 0x0004}, {0x0115AF, 0x0010}, -{0x0115B6, 0x0080}, +{0x0115B6, 0x0001}, {0x0115B8, 0x0010}, {0x0115C1, 0x0020}, {0x0115D8, 0x0004}, {0x0115DC, 0x0010}, -{0x0115DE, 0x0080}, +{0x0115DE, 0x0001}, {0x011600, 0x0004}, {0x011630, 0x0010}, {0x011641, 0x0020}, {0x011644, 0x0004}, -{0x011645, 0x0080}, +{0x011645, 0x0001}, {0x011650, 0x0002}, -{0x01165A, 0x0080}, +{0x01165A, 0x0001}, {0x011660, 0x0020}, -{0x01166D, 0x0080}, +{0x01166D, 0x0001}, {0x011680, 0x0004}, {0x0116AB, 0x0010}, {0x0116B8, 0x0004}, {0x0116B9, 0x0020}, -{0x0116BA, 0x0080}, +{0x0116BA, 0x0001}, {0x0116C0, 0x0002}, -{0x0116CA, 0x0080}, +{0x0116CA, 0x0001}, {0x011700, 0x0004}, -{0x01171B, 0x0080}, +{0x01171B, 0x0001}, {0x01171D, 0x0010}, -{0x01172C, 0x0080}, +{0x01172C, 0x0001}, {0x011730, 0x0002}, {0x01173C, 0x0020}, {0x01173F, 0x0040}, {0x011740, 0x0004}, -{0x011747, 0x0080}, +{0x011747, 0x0001}, {0x011800, 0x0004}, {0x01182C, 0x0010}, {0x01183B, 0x0020}, -{0x01183C, 0x0080}, +{0x01183C, 0x0001}, {0x0118A0, 0x0004}, {0x0118E0, 0x0002}, -{0x0118F3, 0x0080}, +{0x0118F3, 0x0001}, {0x0118FF, 0x0004}, -{0x011907, 0x0080}, +{0x011907, 0x0001}, {0x011909, 0x0004}, -{0x01190A, 0x0080}, +{0x01190A, 0x0001}, {0x01190C, 0x0004}, -{0x011914, 0x0080}, +{0x011914, 0x0001}, {0x011915, 0x0004}, -{0x011917, 0x0080}, +{0x011917, 0x0001}, {0x011918, 0x0004}, {0x011930, 0x0010}, -{0x011936, 0x0080}, +{0x011936, 0x0001}, {0x011937, 0x0010}, -{0x011939, 0x0080}, +{0x011939, 0x0001}, {0x01193B, 0x0010}, {0x01193F, 0x0004}, {0x011940, 0x0010}, {0x011941, 0x0004}, {0x011942, 0x0010}, {0x011944, 0x0020}, -{0x011947, 0x0080}, +{0x011947, 0x0001}, {0x011950, 0x0002}, -{0x01195A, 0x0080}, +{0x01195A, 0x0001}, {0x0119A0, 0x0004}, -{0x0119A8, 0x0080}, +{0x0119A8, 0x0001}, {0x0119AA, 0x0004}, {0x0119D1, 0x0010}, -{0x0119D8, 0x0080}, +{0x0119D8, 0x0001}, {0x0119DA, 0x0010}, {0x0119E1, 0x0004}, {0x0119E2, 0x0020}, {0x0119E3, 0x0004}, {0x0119E4, 0x0010}, 
-{0x0119E5, 0x0080}, +{0x0119E5, 0x0001}, {0x011A00, 0x0004}, {0x011A01, 0x0010}, {0x011A0B, 0x0004}, @@ -1723,7 +1735,7 @@ const std::vector> unicode_ranges_flags = { // st {0x011A3B, 0x0010}, {0x011A3F, 0x0020}, {0x011A47, 0x0010}, -{0x011A48, 0x0080}, +{0x011A48, 0x0001}, {0x011A50, 0x0004}, {0x011A51, 0x0010}, {0x011A5C, 0x0004}, @@ -1731,117 +1743,117 @@ const std::vector> unicode_ranges_flags = { // st {0x011A9A, 0x0020}, {0x011A9D, 0x0004}, {0x011A9E, 0x0020}, -{0x011AA3, 0x0080}, +{0x011AA3, 0x0001}, {0x011AB0, 0x0004}, -{0x011AF9, 0x0080}, +{0x011AF9, 0x0001}, {0x011B00, 0x0020}, -{0x011B0A, 0x0080}, +{0x011B0A, 0x0001}, {0x011C00, 0x0004}, -{0x011C09, 0x0080}, +{0x011C09, 0x0001}, {0x011C0A, 0x0004}, {0x011C2F, 0x0010}, -{0x011C37, 0x0080}, +{0x011C37, 0x0001}, {0x011C38, 0x0010}, {0x011C40, 0x0004}, {0x011C41, 0x0020}, -{0x011C46, 0x0080}, +{0x011C46, 0x0001}, {0x011C50, 0x0002}, -{0x011C6D, 0x0080}, +{0x011C6D, 0x0001}, {0x011C70, 0x0020}, {0x011C72, 0x0004}, -{0x011C90, 0x0080}, +{0x011C90, 0x0001}, {0x011C92, 0x0010}, -{0x011CA8, 0x0080}, +{0x011CA8, 0x0001}, {0x011CA9, 0x0010}, -{0x011CB7, 0x0080}, +{0x011CB7, 0x0001}, {0x011D00, 0x0004}, -{0x011D07, 0x0080}, +{0x011D07, 0x0001}, {0x011D08, 0x0004}, -{0x011D0A, 0x0080}, +{0x011D0A, 0x0001}, {0x011D0B, 0x0004}, {0x011D31, 0x0010}, -{0x011D37, 0x0080}, +{0x011D37, 0x0001}, {0x011D3A, 0x0010}, -{0x011D3B, 0x0080}, +{0x011D3B, 0x0001}, {0x011D3C, 0x0010}, -{0x011D3E, 0x0080}, +{0x011D3E, 0x0001}, {0x011D3F, 0x0010}, {0x011D46, 0x0004}, {0x011D47, 0x0010}, -{0x011D48, 0x0080}, +{0x011D48, 0x0001}, {0x011D50, 0x0002}, -{0x011D5A, 0x0080}, +{0x011D5A, 0x0001}, {0x011D60, 0x0004}, -{0x011D66, 0x0080}, +{0x011D66, 0x0001}, {0x011D67, 0x0004}, -{0x011D69, 0x0080}, +{0x011D69, 0x0001}, {0x011D6A, 0x0004}, {0x011D8A, 0x0010}, -{0x011D8F, 0x0080}, +{0x011D8F, 0x0001}, {0x011D90, 0x0010}, -{0x011D92, 0x0080}, +{0x011D92, 0x0001}, {0x011D93, 0x0010}, {0x011D98, 0x0004}, -{0x011D99, 0x0080}, +{0x011D99, 0x0001}, 
{0x011DA0, 0x0002}, -{0x011DAA, 0x0080}, +{0x011DAA, 0x0001}, {0x011EE0, 0x0004}, {0x011EF3, 0x0010}, {0x011EF7, 0x0020}, -{0x011EF9, 0x0080}, +{0x011EF9, 0x0001}, {0x011F00, 0x0010}, {0x011F02, 0x0004}, {0x011F03, 0x0010}, {0x011F04, 0x0004}, -{0x011F11, 0x0080}, +{0x011F11, 0x0001}, {0x011F12, 0x0004}, {0x011F34, 0x0010}, -{0x011F3B, 0x0080}, +{0x011F3B, 0x0001}, {0x011F3E, 0x0010}, {0x011F43, 0x0020}, {0x011F50, 0x0002}, -{0x011F5A, 0x0080}, +{0x011F5A, 0x0001}, {0x011FB0, 0x0004}, -{0x011FB1, 0x0080}, +{0x011FB1, 0x0001}, {0x011FC0, 0x0002}, {0x011FD5, 0x0040}, -{0x011FF2, 0x0080}, +{0x011FF2, 0x0001}, {0x011FFF, 0x0020}, {0x012000, 0x0004}, -{0x01239A, 0x0080}, +{0x01239A, 0x0001}, {0x012400, 0x0002}, -{0x01246F, 0x0080}, +{0x01246F, 0x0001}, {0x012470, 0x0020}, -{0x012475, 0x0080}, +{0x012475, 0x0001}, {0x012480, 0x0004}, -{0x012544, 0x0080}, +{0x012544, 0x0001}, {0x012F90, 0x0004}, {0x012FF1, 0x0020}, -{0x012FF3, 0x0080}, +{0x012FF3, 0x0001}, {0x013000, 0x0004}, {0x013430, 0x0080}, {0x013440, 0x0010}, {0x013441, 0x0004}, {0x013447, 0x0010}, -{0x013456, 0x0080}, +{0x013456, 0x0001}, {0x014400, 0x0004}, -{0x014647, 0x0080}, +{0x014647, 0x0001}, {0x016800, 0x0004}, -{0x016A39, 0x0080}, +{0x016A39, 0x0001}, {0x016A40, 0x0004}, -{0x016A5F, 0x0080}, +{0x016A5F, 0x0001}, {0x016A60, 0x0002}, -{0x016A6A, 0x0080}, +{0x016A6A, 0x0001}, {0x016A6E, 0x0020}, {0x016A70, 0x0004}, -{0x016ABF, 0x0080}, +{0x016ABF, 0x0001}, {0x016AC0, 0x0002}, -{0x016ACA, 0x0080}, +{0x016ACA, 0x0001}, {0x016AD0, 0x0004}, -{0x016AEE, 0x0080}, +{0x016AEE, 0x0001}, {0x016AF0, 0x0010}, {0x016AF5, 0x0020}, -{0x016AF6, 0x0080}, +{0x016AF6, 0x0001}, {0x016B00, 0x0004}, {0x016B30, 0x0010}, {0x016B37, 0x0020}, @@ -1849,81 +1861,82 @@ const std::vector> unicode_ranges_flags = { // st {0x016B40, 0x0004}, {0x016B44, 0x0020}, {0x016B45, 0x0040}, -{0x016B46, 0x0080}, +{0x016B46, 0x0001}, {0x016B50, 0x0002}, -{0x016B5A, 0x0080}, +{0x016B5A, 0x0001}, {0x016B5B, 0x0002}, -{0x016B62, 0x0080}, +{0x016B62, 
0x0001}, {0x016B63, 0x0004}, -{0x016B78, 0x0080}, +{0x016B78, 0x0001}, {0x016B7D, 0x0004}, -{0x016B90, 0x0080}, +{0x016B90, 0x0001}, {0x016E40, 0x0004}, {0x016E80, 0x0002}, {0x016E97, 0x0020}, -{0x016E9B, 0x0080}, +{0x016E9B, 0x0001}, {0x016F00, 0x0004}, -{0x016F4B, 0x0080}, +{0x016F4B, 0x0001}, {0x016F4F, 0x0010}, {0x016F50, 0x0004}, {0x016F51, 0x0010}, -{0x016F88, 0x0080}, +{0x016F88, 0x0001}, {0x016F8F, 0x0010}, {0x016F93, 0x0004}, -{0x016FA0, 0x0080}, +{0x016FA0, 0x0001}, {0x016FE0, 0x0004}, {0x016FE2, 0x0020}, {0x016FE3, 0x0004}, {0x016FE4, 0x0010}, -{0x016FE5, 0x0080}, +{0x016FE5, 0x0001}, {0x016FF0, 0x0010}, -{0x016FF2, 0x0080}, +{0x016FF2, 0x0001}, {0x017000, 0x0004}, -{0x0187F8, 0x0080}, +{0x0187F8, 0x0001}, {0x018800, 0x0004}, -{0x018CD6, 0x0080}, +{0x018CD6, 0x0001}, {0x018D00, 0x0004}, -{0x018D09, 0x0080}, +{0x018D09, 0x0001}, {0x01AFF0, 0x0004}, -{0x01AFF4, 0x0080}, +{0x01AFF4, 0x0001}, {0x01AFF5, 0x0004}, -{0x01AFFC, 0x0080}, +{0x01AFFC, 0x0001}, {0x01AFFD, 0x0004}, -{0x01AFFF, 0x0080}, +{0x01AFFF, 0x0001}, {0x01B000, 0x0004}, -{0x01B123, 0x0080}, +{0x01B123, 0x0001}, {0x01B132, 0x0004}, -{0x01B133, 0x0080}, +{0x01B133, 0x0001}, {0x01B150, 0x0004}, -{0x01B153, 0x0080}, +{0x01B153, 0x0001}, {0x01B155, 0x0004}, -{0x01B156, 0x0080}, +{0x01B156, 0x0001}, {0x01B164, 0x0004}, -{0x01B168, 0x0080}, +{0x01B168, 0x0001}, {0x01B170, 0x0004}, -{0x01B2FC, 0x0080}, +{0x01B2FC, 0x0001}, {0x01BC00, 0x0004}, -{0x01BC6B, 0x0080}, +{0x01BC6B, 0x0001}, {0x01BC70, 0x0004}, -{0x01BC7D, 0x0080}, +{0x01BC7D, 0x0001}, {0x01BC80, 0x0004}, -{0x01BC89, 0x0080}, +{0x01BC89, 0x0001}, {0x01BC90, 0x0004}, -{0x01BC9A, 0x0080}, +{0x01BC9A, 0x0001}, {0x01BC9C, 0x0040}, {0x01BC9D, 0x0010}, {0x01BC9F, 0x0020}, {0x01BCA0, 0x0080}, +{0x01BCA4, 0x0001}, {0x01CF00, 0x0010}, -{0x01CF2E, 0x0080}, +{0x01CF2E, 0x0001}, {0x01CF30, 0x0010}, -{0x01CF47, 0x0080}, +{0x01CF47, 0x0001}, {0x01CF50, 0x0040}, -{0x01CFC4, 0x0080}, +{0x01CFC4, 0x0001}, {0x01D000, 0x0040}, -{0x01D0F6, 0x0080}, +{0x01D0F6, 
0x0001}, {0x01D100, 0x0040}, -{0x01D127, 0x0080}, +{0x01D127, 0x0001}, {0x01D129, 0x0040}, {0x01D165, 0x0010}, {0x01D16A, 0x0040}, @@ -1935,57 +1948,57 @@ const std::vector> unicode_ranges_flags = { // st {0x01D18C, 0x0040}, {0x01D1AA, 0x0010}, {0x01D1AE, 0x0040}, -{0x01D1EB, 0x0080}, +{0x01D1EB, 0x0001}, {0x01D200, 0x0040}, {0x01D242, 0x0010}, {0x01D245, 0x0040}, -{0x01D246, 0x0080}, +{0x01D246, 0x0001}, {0x01D2C0, 0x0002}, -{0x01D2D4, 0x0080}, +{0x01D2D4, 0x0001}, {0x01D2E0, 0x0002}, -{0x01D2F4, 0x0080}, +{0x01D2F4, 0x0001}, {0x01D300, 0x0040}, -{0x01D357, 0x0080}, +{0x01D357, 0x0001}, {0x01D360, 0x0002}, -{0x01D379, 0x0080}, +{0x01D379, 0x0001}, {0x01D400, 0x0004}, -{0x01D455, 0x0080}, +{0x01D455, 0x0001}, {0x01D456, 0x0004}, -{0x01D49D, 0x0080}, +{0x01D49D, 0x0001}, {0x01D49E, 0x0004}, -{0x01D4A0, 0x0080}, +{0x01D4A0, 0x0001}, {0x01D4A2, 0x0004}, -{0x01D4A3, 0x0080}, +{0x01D4A3, 0x0001}, {0x01D4A5, 0x0004}, -{0x01D4A7, 0x0080}, +{0x01D4A7, 0x0001}, {0x01D4A9, 0x0004}, -{0x01D4AD, 0x0080}, +{0x01D4AD, 0x0001}, {0x01D4AE, 0x0004}, -{0x01D4BA, 0x0080}, +{0x01D4BA, 0x0001}, {0x01D4BB, 0x0004}, -{0x01D4BC, 0x0080}, +{0x01D4BC, 0x0001}, {0x01D4BD, 0x0004}, -{0x01D4C4, 0x0080}, +{0x01D4C4, 0x0001}, {0x01D4C5, 0x0004}, -{0x01D506, 0x0080}, +{0x01D506, 0x0001}, {0x01D507, 0x0004}, -{0x01D50B, 0x0080}, +{0x01D50B, 0x0001}, {0x01D50D, 0x0004}, -{0x01D515, 0x0080}, +{0x01D515, 0x0001}, {0x01D516, 0x0004}, -{0x01D51D, 0x0080}, +{0x01D51D, 0x0001}, {0x01D51E, 0x0004}, -{0x01D53A, 0x0080}, +{0x01D53A, 0x0001}, {0x01D53B, 0x0004}, -{0x01D53F, 0x0080}, +{0x01D53F, 0x0001}, {0x01D540, 0x0004}, -{0x01D545, 0x0080}, +{0x01D545, 0x0001}, {0x01D546, 0x0004}, -{0x01D547, 0x0080}, +{0x01D547, 0x0001}, {0x01D54A, 0x0004}, -{0x01D551, 0x0080}, +{0x01D551, 0x0001}, {0x01D552, 0x0004}, -{0x01D6A6, 0x0080}, +{0x01D6A6, 0x0001}, {0x01D6A8, 0x0004}, {0x01D6C1, 0x0040}, {0x01D6C2, 0x0004}, @@ -2007,7 +2020,7 @@ const std::vector> unicode_ranges_flags = { // st {0x01D7AA, 0x0004}, {0x01D7C3, 
0x0040}, {0x01D7C4, 0x0004}, -{0x01D7CC, 0x0080}, +{0x01D7CC, 0x0001}, {0x01D7CE, 0x0002}, {0x01D800, 0x0040}, {0x01DA00, 0x0010}, @@ -2019,251 +2032,283 @@ const std::vector> unicode_ranges_flags = { // st {0x01DA84, 0x0010}, {0x01DA85, 0x0040}, {0x01DA87, 0x0020}, -{0x01DA8C, 0x0080}, +{0x01DA8C, 0x0001}, {0x01DA9B, 0x0010}, -{0x01DAA0, 0x0080}, +{0x01DAA0, 0x0001}, {0x01DAA1, 0x0010}, -{0x01DAB0, 0x0080}, +{0x01DAB0, 0x0001}, {0x01DF00, 0x0004}, -{0x01DF1F, 0x0080}, +{0x01DF1F, 0x0001}, {0x01DF25, 0x0004}, -{0x01DF2B, 0x0080}, +{0x01DF2B, 0x0001}, {0x01E000, 0x0010}, -{0x01E007, 0x0080}, +{0x01E007, 0x0001}, {0x01E008, 0x0010}, -{0x01E019, 0x0080}, +{0x01E019, 0x0001}, {0x01E01B, 0x0010}, -{0x01E022, 0x0080}, +{0x01E022, 0x0001}, {0x01E023, 0x0010}, -{0x01E025, 0x0080}, +{0x01E025, 0x0001}, {0x01E026, 0x0010}, -{0x01E02B, 0x0080}, +{0x01E02B, 0x0001}, {0x01E030, 0x0004}, -{0x01E06E, 0x0080}, +{0x01E06E, 0x0001}, {0x01E08F, 0x0010}, -{0x01E090, 0x0080}, +{0x01E090, 0x0001}, {0x01E100, 0x0004}, -{0x01E12D, 0x0080}, +{0x01E12D, 0x0001}, {0x01E130, 0x0010}, {0x01E137, 0x0004}, -{0x01E13E, 0x0080}, +{0x01E13E, 0x0001}, {0x01E140, 0x0002}, -{0x01E14A, 0x0080}, +{0x01E14A, 0x0001}, {0x01E14E, 0x0004}, {0x01E14F, 0x0040}, -{0x01E150, 0x0080}, +{0x01E150, 0x0001}, {0x01E290, 0x0004}, {0x01E2AE, 0x0010}, -{0x01E2AF, 0x0080}, +{0x01E2AF, 0x0001}, {0x01E2C0, 0x0004}, {0x01E2EC, 0x0010}, {0x01E2F0, 0x0002}, -{0x01E2FA, 0x0080}, +{0x01E2FA, 0x0001}, {0x01E2FF, 0x0040}, -{0x01E300, 0x0080}, +{0x01E300, 0x0001}, {0x01E4D0, 0x0004}, {0x01E4EC, 0x0010}, {0x01E4F0, 0x0002}, -{0x01E4FA, 0x0080}, +{0x01E4FA, 0x0001}, {0x01E7E0, 0x0004}, -{0x01E7E7, 0x0080}, +{0x01E7E7, 0x0001}, {0x01E7E8, 0x0004}, -{0x01E7EC, 0x0080}, +{0x01E7EC, 0x0001}, {0x01E7ED, 0x0004}, -{0x01E7EF, 0x0080}, +{0x01E7EF, 0x0001}, {0x01E7F0, 0x0004}, -{0x01E7FF, 0x0080}, +{0x01E7FF, 0x0001}, {0x01E800, 0x0004}, -{0x01E8C5, 0x0080}, +{0x01E8C5, 0x0001}, {0x01E8C7, 0x0002}, {0x01E8D0, 0x0010}, -{0x01E8D7, 0x0080}, 
+{0x01E8D7, 0x0001}, {0x01E900, 0x0004}, {0x01E944, 0x0010}, {0x01E94B, 0x0004}, -{0x01E94C, 0x0080}, +{0x01E94C, 0x0001}, {0x01E950, 0x0002}, -{0x01E95A, 0x0080}, +{0x01E95A, 0x0001}, {0x01E95E, 0x0020}, -{0x01E960, 0x0080}, +{0x01E960, 0x0001}, {0x01EC71, 0x0002}, {0x01ECAC, 0x0040}, {0x01ECAD, 0x0002}, {0x01ECB0, 0x0040}, {0x01ECB1, 0x0002}, -{0x01ECB5, 0x0080}, +{0x01ECB5, 0x0001}, {0x01ED01, 0x0002}, {0x01ED2E, 0x0040}, {0x01ED2F, 0x0002}, -{0x01ED3E, 0x0080}, +{0x01ED3E, 0x0001}, {0x01EE00, 0x0004}, -{0x01EE04, 0x0080}, +{0x01EE04, 0x0001}, {0x01EE05, 0x0004}, -{0x01EE20, 0x0080}, +{0x01EE20, 0x0001}, {0x01EE21, 0x0004}, -{0x01EE23, 0x0080}, +{0x01EE23, 0x0001}, {0x01EE24, 0x0004}, -{0x01EE25, 0x0080}, +{0x01EE25, 0x0001}, {0x01EE27, 0x0004}, -{0x01EE28, 0x0080}, +{0x01EE28, 0x0001}, {0x01EE29, 0x0004}, -{0x01EE33, 0x0080}, +{0x01EE33, 0x0001}, {0x01EE34, 0x0004}, -{0x01EE38, 0x0080}, +{0x01EE38, 0x0001}, {0x01EE39, 0x0004}, -{0x01EE3A, 0x0080}, +{0x01EE3A, 0x0001}, {0x01EE3B, 0x0004}, -{0x01EE3C, 0x0080}, +{0x01EE3C, 0x0001}, {0x01EE42, 0x0004}, -{0x01EE43, 0x0080}, +{0x01EE43, 0x0001}, {0x01EE47, 0x0004}, -{0x01EE48, 0x0080}, +{0x01EE48, 0x0001}, {0x01EE49, 0x0004}, -{0x01EE4A, 0x0080}, +{0x01EE4A, 0x0001}, {0x01EE4B, 0x0004}, -{0x01EE4C, 0x0080}, +{0x01EE4C, 0x0001}, {0x01EE4D, 0x0004}, -{0x01EE50, 0x0080}, +{0x01EE50, 0x0001}, {0x01EE51, 0x0004}, -{0x01EE53, 0x0080}, +{0x01EE53, 0x0001}, {0x01EE54, 0x0004}, -{0x01EE55, 0x0080}, +{0x01EE55, 0x0001}, {0x01EE57, 0x0004}, -{0x01EE58, 0x0080}, +{0x01EE58, 0x0001}, {0x01EE59, 0x0004}, -{0x01EE5A, 0x0080}, +{0x01EE5A, 0x0001}, {0x01EE5B, 0x0004}, -{0x01EE5C, 0x0080}, +{0x01EE5C, 0x0001}, {0x01EE5D, 0x0004}, -{0x01EE5E, 0x0080}, +{0x01EE5E, 0x0001}, {0x01EE5F, 0x0004}, -{0x01EE60, 0x0080}, +{0x01EE60, 0x0001}, {0x01EE61, 0x0004}, -{0x01EE63, 0x0080}, +{0x01EE63, 0x0001}, {0x01EE64, 0x0004}, -{0x01EE65, 0x0080}, +{0x01EE65, 0x0001}, {0x01EE67, 0x0004}, -{0x01EE6B, 0x0080}, +{0x01EE6B, 0x0001}, {0x01EE6C, 0x0004}, 
-{0x01EE73, 0x0080}, +{0x01EE73, 0x0001}, {0x01EE74, 0x0004}, -{0x01EE78, 0x0080}, +{0x01EE78, 0x0001}, {0x01EE79, 0x0004}, -{0x01EE7D, 0x0080}, +{0x01EE7D, 0x0001}, {0x01EE7E, 0x0004}, -{0x01EE7F, 0x0080}, +{0x01EE7F, 0x0001}, {0x01EE80, 0x0004}, -{0x01EE8A, 0x0080}, +{0x01EE8A, 0x0001}, {0x01EE8B, 0x0004}, -{0x01EE9C, 0x0080}, +{0x01EE9C, 0x0001}, {0x01EEA1, 0x0004}, -{0x01EEA4, 0x0080}, +{0x01EEA4, 0x0001}, {0x01EEA5, 0x0004}, -{0x01EEAA, 0x0080}, +{0x01EEAA, 0x0001}, {0x01EEAB, 0x0004}, -{0x01EEBC, 0x0080}, +{0x01EEBC, 0x0001}, {0x01EEF0, 0x0040}, -{0x01EEF2, 0x0080}, +{0x01EEF2, 0x0001}, {0x01F000, 0x0040}, -{0x01F02C, 0x0080}, +{0x01F02C, 0x0001}, {0x01F030, 0x0040}, -{0x01F094, 0x0080}, +{0x01F094, 0x0001}, {0x01F0A0, 0x0040}, -{0x01F0AF, 0x0080}, +{0x01F0AF, 0x0001}, {0x01F0B1, 0x0040}, -{0x01F0C0, 0x0080}, +{0x01F0C0, 0x0001}, {0x01F0C1, 0x0040}, -{0x01F0D0, 0x0080}, +{0x01F0D0, 0x0001}, {0x01F0D1, 0x0040}, -{0x01F0F6, 0x0080}, +{0x01F0F6, 0x0001}, {0x01F100, 0x0002}, {0x01F10D, 0x0040}, -{0x01F1AE, 0x0080}, +{0x01F1AE, 0x0001}, {0x01F1E6, 0x0040}, -{0x01F203, 0x0080}, +{0x01F203, 0x0001}, {0x01F210, 0x0040}, -{0x01F23C, 0x0080}, +{0x01F23C, 0x0001}, {0x01F240, 0x0040}, -{0x01F249, 0x0080}, +{0x01F249, 0x0001}, {0x01F250, 0x0040}, -{0x01F252, 0x0080}, +{0x01F252, 0x0001}, {0x01F260, 0x0040}, -{0x01F266, 0x0080}, +{0x01F266, 0x0001}, {0x01F300, 0x0040}, -{0x01F6D8, 0x0080}, +{0x01F6D8, 0x0001}, {0x01F6DC, 0x0040}, -{0x01F6ED, 0x0080}, +{0x01F6ED, 0x0001}, {0x01F6F0, 0x0040}, -{0x01F6FD, 0x0080}, +{0x01F6FD, 0x0001}, {0x01F700, 0x0040}, -{0x01F777, 0x0080}, +{0x01F777, 0x0001}, {0x01F77B, 0x0040}, -{0x01F7DA, 0x0080}, +{0x01F7DA, 0x0001}, {0x01F7E0, 0x0040}, -{0x01F7EC, 0x0080}, +{0x01F7EC, 0x0001}, {0x01F7F0, 0x0040}, -{0x01F7F1, 0x0080}, +{0x01F7F1, 0x0001}, {0x01F800, 0x0040}, -{0x01F80C, 0x0080}, +{0x01F80C, 0x0001}, {0x01F810, 0x0040}, -{0x01F848, 0x0080}, +{0x01F848, 0x0001}, {0x01F850, 0x0040}, -{0x01F85A, 0x0080}, +{0x01F85A, 0x0001}, {0x01F860, 
0x0040}, -{0x01F888, 0x0080}, +{0x01F888, 0x0001}, {0x01F890, 0x0040}, -{0x01F8AE, 0x0080}, +{0x01F8AE, 0x0001}, {0x01F8B0, 0x0040}, -{0x01F8B2, 0x0080}, +{0x01F8B2, 0x0001}, {0x01F900, 0x0040}, -{0x01FA54, 0x0080}, +{0x01FA54, 0x0001}, {0x01FA60, 0x0040}, -{0x01FA6E, 0x0080}, +{0x01FA6E, 0x0001}, {0x01FA70, 0x0040}, -{0x01FA7D, 0x0080}, +{0x01FA7D, 0x0001}, {0x01FA80, 0x0040}, -{0x01FA89, 0x0080}, +{0x01FA89, 0x0001}, {0x01FA90, 0x0040}, -{0x01FABE, 0x0080}, +{0x01FABE, 0x0001}, {0x01FABF, 0x0040}, -{0x01FAC6, 0x0080}, +{0x01FAC6, 0x0001}, {0x01FACE, 0x0040}, -{0x01FADC, 0x0080}, +{0x01FADC, 0x0001}, {0x01FAE0, 0x0040}, -{0x01FAE9, 0x0080}, +{0x01FAE9, 0x0001}, {0x01FAF0, 0x0040}, -{0x01FAF9, 0x0080}, +{0x01FAF9, 0x0001}, {0x01FB00, 0x0040}, -{0x01FB93, 0x0080}, +{0x01FB93, 0x0001}, {0x01FB94, 0x0040}, -{0x01FBCB, 0x0080}, +{0x01FBCB, 0x0001}, {0x01FBF0, 0x0002}, -{0x01FBFA, 0x0080}, +{0x01FBFA, 0x0001}, {0x020000, 0x0004}, -{0x02A6E0, 0x0080}, +{0x02A6E0, 0x0001}, {0x02A700, 0x0004}, -{0x02B73A, 0x0080}, +{0x02B73A, 0x0001}, {0x02B740, 0x0004}, -{0x02B81E, 0x0080}, +{0x02B81E, 0x0001}, {0x02B820, 0x0004}, -{0x02CEA2, 0x0080}, +{0x02CEA2, 0x0001}, {0x02CEB0, 0x0004}, -{0x02EBE1, 0x0080}, +{0x02EBE1, 0x0001}, {0x02EBF0, 0x0004}, -{0x02EE5E, 0x0080}, +{0x02EE5E, 0x0001}, {0x02F800, 0x0004}, -{0x02FA1E, 0x0080}, +{0x02FA1E, 0x0001}, {0x030000, 0x0004}, -{0x03134B, 0x0080}, +{0x03134B, 0x0001}, {0x031350, 0x0004}, -{0x0323B0, 0x0080}, +{0x0323B0, 0x0001}, +{0x0E0001, 0x0080}, +{0x0E0002, 0x0001}, +{0x0E0020, 0x0080}, +{0x0E0080, 0x0001}, {0x0E0100, 0x0010}, -{0x0E01F0, 0x0080}, +{0x0E01F0, 0x0001}, +{0x0F0000, 0x0080}, +{0x0FFFFE, 0x0001}, +{0x100000, 0x0080}, +{0x10FFFE, 0x0001}, {0x110000, 0x0000}, }; const std::unordered_set unicode_set_whitespace = { -0x000009, 0x00000A, 0x00000B, 0x00000C, 0x00000D, 0x000020, 0x000085, 0x0000A0, 0x001680, 0x002000, 0x002001, 0x002002, 0x002003, 0x002004, 0x002005, 0x002006, 0x002007, 0x002008, 0x002009, 0x00200A, 0x002028, 
0x002029, 0x00202F, 0x00205F, 0x003000 +0x000009, +0x00000A, +0x00000B, +0x00000C, +0x00000D, +0x000020, +0x000085, +0x0000A0, +0x001680, +0x002000, +0x002001, +0x002002, +0x002003, +0x002004, +0x002005, +0x002006, +0x002007, +0x002008, +0x002009, +0x00200A, +0x002028, +0x002029, +0x00202F, +0x00205F, +0x003000, }; const std::unordered_map unicode_map_lowercase = { @@ -3222,6 +3267,7 @@ const std::unordered_map unicode_map_lowercase = { {0x002C2C, 0x002C5C}, {0x002C2D, 0x002C5D}, {0x002C2E, 0x002C5E}, +{0x002C2F, 0x002C5F}, {0x002C60, 0x002C61}, {0x002C62, 0x00026B}, {0x002C63, 0x001D7D}, @@ -3402,12 +3448,16 @@ const std::unordered_map unicode_map_lowercase = { {0x00A7BA, 0x00A7BB}, {0x00A7BC, 0x00A7BD}, {0x00A7BE, 0x00A7BF}, +{0x00A7C0, 0x00A7C1}, {0x00A7C2, 0x00A7C3}, {0x00A7C4, 0x00A794}, {0x00A7C5, 0x000282}, {0x00A7C6, 0x001D8E}, {0x00A7C7, 0x00A7C8}, {0x00A7C9, 0x00A7CA}, +{0x00A7D0, 0x00A7D1}, +{0x00A7D6, 0x00A7D7}, +{0x00A7D8, 0x00A7D9}, {0x00A7F5, 0x00A7F6}, {0x00FF21, 0x00FF41}, {0x00FF22, 0x00FF42}, @@ -3511,6 +3561,41 @@ const std::unordered_map unicode_map_lowercase = { {0x0104D1, 0x0104F9}, {0x0104D2, 0x0104FA}, {0x0104D3, 0x0104FB}, +{0x010570, 0x010597}, +{0x010571, 0x010598}, +{0x010572, 0x010599}, +{0x010573, 0x01059A}, +{0x010574, 0x01059B}, +{0x010575, 0x01059C}, +{0x010576, 0x01059D}, +{0x010577, 0x01059E}, +{0x010578, 0x01059F}, +{0x010579, 0x0105A0}, +{0x01057A, 0x0105A1}, +{0x01057C, 0x0105A3}, +{0x01057D, 0x0105A4}, +{0x01057E, 0x0105A5}, +{0x01057F, 0x0105A6}, +{0x010580, 0x0105A7}, +{0x010581, 0x0105A8}, +{0x010582, 0x0105A9}, +{0x010583, 0x0105AA}, +{0x010584, 0x0105AB}, +{0x010585, 0x0105AC}, +{0x010586, 0x0105AD}, +{0x010587, 0x0105AE}, +{0x010588, 0x0105AF}, +{0x010589, 0x0105B0}, +{0x01058A, 0x0105B1}, +{0x01058C, 0x0105B3}, +{0x01058D, 0x0105B4}, +{0x01058E, 0x0105B5}, +{0x01058F, 0x0105B6}, +{0x010590, 0x0105B7}, +{0x010591, 0x0105B8}, +{0x010592, 0x0105B9}, +{0x010594, 0x0105BB}, +{0x010595, 0x0105BC}, {0x010C80, 0x010CC0}, 
{0x010C81, 0x010CC1}, {0x010C82, 0x010CC2}, @@ -3690,7 +3775,6 @@ const std::unordered_map unicode_map_uppercase = { {0x000079, 0x000059}, {0x00007A, 0x00005A}, {0x0000B5, 0x00039C}, -{0x0000DF, 0x000053}, {0x0000E0, 0x0000C0}, {0x0000E1, 0x0000C1}, {0x0000E2, 0x0000C2}, @@ -3758,7 +3842,6 @@ const std::unordered_map unicode_map_uppercase = { {0x000144, 0x000143}, {0x000146, 0x000145}, {0x000148, 0x000147}, -{0x000149, 0x0002BC}, {0x00014B, 0x00014A}, {0x00014D, 0x00014C}, {0x00014F, 0x00014E}, @@ -3831,7 +3914,6 @@ const std::unordered_map unicode_map_uppercase = { {0x0001EB, 0x0001EA}, {0x0001ED, 0x0001EC}, {0x0001EF, 0x0001EE}, -{0x0001F0, 0x00004A}, {0x0001F2, 0x0001F1}, {0x0001F3, 0x0001F1}, {0x0001F5, 0x0001F4}, @@ -3917,12 +3999,10 @@ const std::unordered_map unicode_map_uppercase = { {0x00037B, 0x0003FD}, {0x00037C, 0x0003FE}, {0x00037D, 0x0003FF}, -{0x000390, 0x000399}, {0x0003AC, 0x000386}, {0x0003AD, 0x000388}, {0x0003AE, 0x000389}, {0x0003AF, 0x00038A}, -{0x0003B0, 0x0003A5}, {0x0003B1, 0x000391}, {0x0003B2, 0x000392}, {0x0003B3, 0x000393}, @@ -4163,7 +4243,6 @@ const std::unordered_map unicode_map_uppercase = { {0x000584, 0x000554}, {0x000585, 0x000555}, {0x000586, 0x000556}, -{0x000587, 0x000535}, {0x0010D0, 0x001C90}, {0x0010D1, 0x001C91}, {0x0010D2, 0x001C92}, @@ -4303,11 +4382,6 @@ const std::unordered_map unicode_map_uppercase = { {0x001E91, 0x001E90}, {0x001E93, 0x001E92}, {0x001E95, 0x001E94}, -{0x001E96, 0x000048}, -{0x001E97, 0x000054}, -{0x001E98, 0x000057}, -{0x001E99, 0x000059}, -{0x001E9A, 0x000041}, {0x001E9B, 0x001E60}, {0x001EA1, 0x001EA0}, {0x001EA3, 0x001EA2}, @@ -4393,13 +4467,9 @@ const std::unordered_map unicode_map_uppercase = { {0x001F43, 0x001F4B}, {0x001F44, 0x001F4C}, {0x001F45, 0x001F4D}, -{0x001F50, 0x0003A5}, {0x001F51, 0x001F59}, -{0x001F52, 0x0003A5}, {0x001F53, 0x001F5B}, -{0x001F54, 0x0003A5}, {0x001F55, 0x001F5D}, -{0x001F56, 0x0003A5}, {0x001F57, 0x001F5F}, {0x001F60, 0x001F68}, {0x001F61, 0x001F69}, @@ -4423,89 
+4493,41 @@ const std::unordered_map unicode_map_uppercase = { {0x001F7B, 0x001FEB}, {0x001F7C, 0x001FFA}, {0x001F7D, 0x001FFB}, -{0x001F80, 0x001F08}, -{0x001F81, 0x001F09}, -{0x001F82, 0x001F0A}, -{0x001F83, 0x001F0B}, -{0x001F84, 0x001F0C}, -{0x001F85, 0x001F0D}, -{0x001F86, 0x001F0E}, -{0x001F87, 0x001F0F}, -{0x001F88, 0x001F08}, -{0x001F89, 0x001F09}, -{0x001F8A, 0x001F0A}, -{0x001F8B, 0x001F0B}, -{0x001F8C, 0x001F0C}, -{0x001F8D, 0x001F0D}, -{0x001F8E, 0x001F0E}, -{0x001F8F, 0x001F0F}, -{0x001F90, 0x001F28}, -{0x001F91, 0x001F29}, -{0x001F92, 0x001F2A}, -{0x001F93, 0x001F2B}, -{0x001F94, 0x001F2C}, -{0x001F95, 0x001F2D}, -{0x001F96, 0x001F2E}, -{0x001F97, 0x001F2F}, -{0x001F98, 0x001F28}, -{0x001F99, 0x001F29}, -{0x001F9A, 0x001F2A}, -{0x001F9B, 0x001F2B}, -{0x001F9C, 0x001F2C}, -{0x001F9D, 0x001F2D}, -{0x001F9E, 0x001F2E}, -{0x001F9F, 0x001F2F}, -{0x001FA0, 0x001F68}, -{0x001FA1, 0x001F69}, -{0x001FA2, 0x001F6A}, -{0x001FA3, 0x001F6B}, -{0x001FA4, 0x001F6C}, -{0x001FA5, 0x001F6D}, -{0x001FA6, 0x001F6E}, -{0x001FA7, 0x001F6F}, -{0x001FA8, 0x001F68}, -{0x001FA9, 0x001F69}, -{0x001FAA, 0x001F6A}, -{0x001FAB, 0x001F6B}, -{0x001FAC, 0x001F6C}, -{0x001FAD, 0x001F6D}, -{0x001FAE, 0x001F6E}, -{0x001FAF, 0x001F6F}, +{0x001F80, 0x001F88}, +{0x001F81, 0x001F89}, +{0x001F82, 0x001F8A}, +{0x001F83, 0x001F8B}, +{0x001F84, 0x001F8C}, +{0x001F85, 0x001F8D}, +{0x001F86, 0x001F8E}, +{0x001F87, 0x001F8F}, +{0x001F90, 0x001F98}, +{0x001F91, 0x001F99}, +{0x001F92, 0x001F9A}, +{0x001F93, 0x001F9B}, +{0x001F94, 0x001F9C}, +{0x001F95, 0x001F9D}, +{0x001F96, 0x001F9E}, +{0x001F97, 0x001F9F}, +{0x001FA0, 0x001FA8}, +{0x001FA1, 0x001FA9}, +{0x001FA2, 0x001FAA}, +{0x001FA3, 0x001FAB}, +{0x001FA4, 0x001FAC}, +{0x001FA5, 0x001FAD}, +{0x001FA6, 0x001FAE}, +{0x001FA7, 0x001FAF}, {0x001FB0, 0x001FB8}, {0x001FB1, 0x001FB9}, -{0x001FB2, 0x001FBA}, -{0x001FB3, 0x000391}, -{0x001FB4, 0x000386}, -{0x001FB6, 0x000391}, -{0x001FB7, 0x000391}, -{0x001FBC, 0x000391}, +{0x001FB3, 0x001FBC}, 
{0x001FBE, 0x000399}, -{0x001FC2, 0x001FCA}, -{0x001FC3, 0x000397}, -{0x001FC4, 0x000389}, -{0x001FC6, 0x000397}, -{0x001FC7, 0x000397}, -{0x001FCC, 0x000397}, +{0x001FC3, 0x001FCC}, {0x001FD0, 0x001FD8}, {0x001FD1, 0x001FD9}, -{0x001FD2, 0x000399}, -{0x001FD3, 0x000399}, -{0x001FD6, 0x000399}, -{0x001FD7, 0x000399}, {0x001FE0, 0x001FE8}, {0x001FE1, 0x001FE9}, -{0x001FE2, 0x0003A5}, -{0x001FE3, 0x0003A5}, -{0x001FE4, 0x0003A1}, {0x001FE5, 0x001FEC}, -{0x001FE6, 0x0003A5}, -{0x001FE7, 0x0003A5}, -{0x001FF2, 0x001FFA}, -{0x001FF3, 0x0003A9}, -{0x001FF4, 0x00038F}, -{0x001FF6, 0x0003A9}, -{0x001FF7, 0x0003A9}, -{0x001FFC, 0x0003A9}, +{0x001FF3, 0x001FFC}, {0x00214E, 0x002132}, {0x002170, 0x002160}, {0x002171, 0x002161}, @@ -4597,6 +4619,7 @@ const std::unordered_map unicode_map_uppercase = { {0x002C5C, 0x002C2C}, {0x002C5D, 0x002C2D}, {0x002C5E, 0x002C2E}, +{0x002C5F, 0x002C2F}, {0x002C61, 0x002C60}, {0x002C65, 0x00023A}, {0x002C66, 0x00023E}, @@ -4800,9 +4823,13 @@ const std::unordered_map unicode_map_uppercase = { {0x00A7BB, 0x00A7BA}, {0x00A7BD, 0x00A7BC}, {0x00A7BF, 0x00A7BE}, +{0x00A7C1, 0x00A7C0}, {0x00A7C3, 0x00A7C2}, {0x00A7C8, 0x00A7C7}, {0x00A7CA, 0x00A7C9}, +{0x00A7D1, 0x00A7D0}, +{0x00A7D7, 0x00A7D6}, +{0x00A7D9, 0x00A7D8}, {0x00A7F6, 0x00A7F5}, {0x00AB53, 0x00A7B3}, {0x00AB70, 0x0013A0}, @@ -4885,18 +4912,6 @@ const std::unordered_map unicode_map_uppercase = { {0x00ABBD, 0x0013ED}, {0x00ABBE, 0x0013EE}, {0x00ABBF, 0x0013EF}, -{0x00FB00, 0x000046}, -{0x00FB01, 0x000046}, -{0x00FB02, 0x000046}, -{0x00FB03, 0x000046}, -{0x00FB04, 0x000046}, -{0x00FB05, 0x000053}, -{0x00FB06, 0x000053}, -{0x00FB13, 0x000544}, -{0x00FB14, 0x000544}, -{0x00FB15, 0x000544}, -{0x00FB16, 0x00054E}, -{0x00FB17, 0x000544}, {0x00FF41, 0x00FF21}, {0x00FF42, 0x00FF22}, {0x00FF43, 0x00FF23}, @@ -4999,6 +5014,41 @@ const std::unordered_map unicode_map_uppercase = { {0x0104F9, 0x0104D1}, {0x0104FA, 0x0104D2}, {0x0104FB, 0x0104D3}, +{0x010597, 0x010570}, +{0x010598, 0x010571}, +{0x010599, 
0x010572}, +{0x01059A, 0x010573}, +{0x01059B, 0x010574}, +{0x01059C, 0x010575}, +{0x01059D, 0x010576}, +{0x01059E, 0x010577}, +{0x01059F, 0x010578}, +{0x0105A0, 0x010579}, +{0x0105A1, 0x01057A}, +{0x0105A3, 0x01057C}, +{0x0105A4, 0x01057D}, +{0x0105A5, 0x01057E}, +{0x0105A6, 0x01057F}, +{0x0105A7, 0x010580}, +{0x0105A8, 0x010581}, +{0x0105A9, 0x010582}, +{0x0105AA, 0x010583}, +{0x0105AB, 0x010584}, +{0x0105AC, 0x010585}, +{0x0105AD, 0x010586}, +{0x0105AE, 0x010587}, +{0x0105AF, 0x010588}, +{0x0105B0, 0x010589}, +{0x0105B1, 0x01058A}, +{0x0105B3, 0x01058C}, +{0x0105B4, 0x01058D}, +{0x0105B5, 0x01058E}, +{0x0105B6, 0x01058F}, +{0x0105B7, 0x010590}, +{0x0105B8, 0x010591}, +{0x0105B9, 0x010592}, +{0x0105BB, 0x010594}, +{0x0105BC, 0x010595}, {0x010CC0, 0x010C80}, {0x010CC1, 0x010C81}, {0x010CC2, 0x010C82}, diff --git a/unicode.cpp b/unicode.cpp index 2f8d73832..913c34b9b 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -226,8 +226,9 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t assert(offset_end <= cpts.size()); start = offset_end; + static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; auto _get_cpt = [&] (const size_t pos) -> uint32_t { - return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0; + return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; }; auto _get_flags = [&] (const size_t pos) -> codepoint_flags { @@ -309,7 +310,7 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t } // regex: \s+(?!\S) - if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) { + if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) { pos += num_whitespaces - 1; _add_token(pos); continue; @@ -344,8 +345,9 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & assert(offset_end <= cpts.size()); start = offset_end; + static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; auto _get_cpt = [&] (const size_t pos) -> uint32_t { - return (offset_ini <= pos && pos < offset_end) ? 
cpts[pos] : 0; + return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; }; auto _get_flags = [&] (const size_t pos) -> codepoint_flags { @@ -450,7 +452,7 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & } // regex: \s+(?!\S) - if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) { + if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) { pos += num_whitespaces - 1; _add_token(pos); continue; @@ -679,10 +681,14 @@ std::vector unicode_regex_split(const std::string & text, const std continue; } - const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag(); + const auto flags = unicode_cpt_flags(cpts[i]); - if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) { - text_collapsed[i] = k_ucat_cpt.at(cpt_flag); + if (flags.is_whitespace) { + //NOTE: C++ std::regex \s does not match 0x85, Rust and Python regex do. + //text_collapsed[i] = (char) 0x85; // as whitespace fallback + text_collapsed[i] = (char) 0x0B; // as whitespace fallback + } else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) { + text_collapsed[i] = k_ucat_cpt.at(flags.category_flag()); } else { text_collapsed[i] = (char) 0xD0; // fallback } @@ -766,9 +772,16 @@ std::vector unicode_regex_split(const std::string & text, const std bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets); } else { // no unicode category used, we can use std::wregex directly - const std::wstring wtext = unicode_wstring_from_utf8(text); const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr); + // std::wregex \s does not match non-ASCII whitespaces, using 0x0B as fallback + std::wstring wtext(cpts.begin(), cpts.end()); + for (size_t i = 0; i < wtext.size(); ++i) { + if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) { + wtext[i] = 0x0B; + } + } + //printf("text: %s\n", text.c_str()); //printf("regex_expr: %s\n", regex_expr.c_str()); bpe_offsets = 
unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);