mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-26 14:20:31 +01:00
92139b90af
* tests : add test-tokenizer-0.sh * unicode : add all unicode number ranges * starcoder : fix pre-tokenizer * tests : add test that fails with DeepSeek tokenizers * falcon : fix regex * unicode : regenerate unicode tables * refact : add tokenizer model * lint : fix * tests : disable failing tests ggml-ci * refact : add tests files ggml-ci * convert : print -> logging ggml-ci * lint : fix * unicode : digit -> number * phi-3 : update
17 lines
806 B
C++
17 lines
806 B
C++
#pragma once
|
|
|
|
#include <cstdint>
|
|
#include <map>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
// Read-only table of (uint32_t, uint32_t) pairs for the "number" category.
// Only declared here; the definition lives in another translation unit.
// NOTE(review): each pair presumably holds the first and last code point of a
// range — confirm the bounds convention (inclusive/exclusive) against the definition.
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
|
|
// Read-only table of (uint32_t, uint32_t) pairs for the "letter" category.
// Only declared here; the definition lives in another translation unit.
// NOTE(review): each pair presumably holds the first and last code point of a
// range — confirm the bounds convention (inclusive/exclusive) against the definition.
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
|
|
// Read-only table of (uint32_t, uint32_t) pairs for the "whitespace" category.
// Only declared here; the definition lives in another translation unit.
// NOTE(review): each pair presumably holds the first and last code point of a
// range — confirm the bounds convention (inclusive/exclusive) against the definition.
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
|
|
// Read-only table of (uint32_t, uint32_t) pairs for the "accent mark" category.
// Only declared here; the definition lives in another translation unit.
// NOTE(review): each pair presumably holds the first and last code point of a
// range — confirm the bounds convention (inclusive/exclusive) against the definition.
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
|
|
// Read-only table of (uint32_t, uint32_t) pairs for the "punctuation" category.
// Only declared here; the definition lives in another translation unit.
// NOTE(review): each pair presumably holds the first and last code point of a
// range — confirm the bounds convention (inclusive/exclusive) against the definition.
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
|
|
// Read-only table of (uint32_t, uint32_t) pairs for the "symbol" category.
// Only declared here; the definition lives in another translation unit.
// NOTE(review): each pair presumably holds the first and last code point of a
// range — confirm the bounds convention (inclusive/exclusive) against the definition.
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
|
|
// Read-only table of (uint32_t, uint32_t) pairs for the "control" category.
// Only declared here; the definition lives in another translation unit.
// NOTE(review): each pair presumably holds the first and last code point of a
// range — confirm the bounds convention (inclusive/exclusive) against the definition.
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
|
|
// Read-only multimap from uint32_t keys to uint32_t values; being a multimap,
// a single key may be associated with several values.
// Only declared here; the definition lives in another translation unit.
// NOTE(review): presumably maps a code point to the code point(s) of its NFD
// (canonical decomposition) form — verify against the definition and its users.
extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
|
|
// Read-only one-to-one map from char32_t keys to char32_t values.
// Only declared here; the definition lives in another translation unit.
// NOTE(review): presumably maps a code point to its lowercase counterpart — verify
// against the definition.
// NOTE(review): this table is typed with char32_t while the other tables declared in
// this header use uint32_t; confirm the difference is intentional before unifying,
// since the type must match the out-of-line definition exactly.
extern const std::map<char32_t, char32_t> unicode_map_lowercase;
|