2024-03-11 16:47:47 +01:00
|
|
|
#pragma once
|
2023-10-03 09:16:26 +02:00
|
|
|
|
2024-03-11 16:47:47 +01:00
|
|
|
#include <cstdint>
|
2024-01-21 16:17:35 +01:00
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
2023-10-03 09:16:26 +02:00
|
|
|
|
|
|
|
// Code point category tag with value 0: "unidentified".
// NOTE(review): presumably the fallback result of unicode_cpt_type() for
// code points that match no other category - confirm in the implementation.
#define CODEPOINT_TYPE_UNIDENTIFIED 0
|
2024-05-04 07:32:32 +02:00
|
|
|
// Code point category tag with value 1: "number".
// NOTE(review): the exact set of code points classified as numbers is
// decided by the implementation, not by this header - confirm there.
#define CODEPOINT_TYPE_NUMBER 1
|
2024-03-11 16:47:47 +01:00
|
|
|
// Code point category tag with value 2: "letter".
// NOTE(review): the exact set of code points classified as letters is
// decided by the implementation, not by this header - confirm there.
#define CODEPOINT_TYPE_LETTER 2
|
2024-05-09 15:30:44 +02:00
|
|
|
// Code point category tag with value 3: "separator".
// NOTE(review): how this category relates to unicode_cpt_is_whitespace()
// is not visible from this header - confirm in the implementation.
#define CODEPOINT_TYPE_SEPARATOR 3
|
2024-03-11 16:47:47 +01:00
|
|
|
// Code point category tag with value 4: "accent mark".
// NOTE(review): presumably covers combining/diacritical marks - confirm
// against the classification tables in the implementation.
#define CODEPOINT_TYPE_ACCENT_MARK 4
|
|
|
|
// Code point category tag with value 5: "punctuation".
// NOTE(review): the exact set of code points classified as punctuation is
// decided by the implementation, not by this header - confirm there.
#define CODEPOINT_TYPE_PUNCTUATION 5
|
|
|
|
// Code point category tag with value 6: "symbol".
// NOTE(review): the exact set of code points classified as symbols is
// decided by the implementation, not by this header - confirm there.
#define CODEPOINT_TYPE_SYMBOL 6
|
|
|
|
// Code point category tag with value 7: "control".
// This is the highest-valued CODEPOINT_TYPE_* constant in this header.
// NOTE(review): the exact set of code points classified as control
// characters is decided by the implementation - confirm there.
#define CODEPOINT_TYPE_CONTROL 7
|
2023-10-03 09:16:26 +02:00
|
|
|
|
2024-03-11 16:47:47 +01:00
|
|
|
// Converts a single code point, passed as a 32-bit unsigned value, into a
// std::string returned by value. Declaration only; defined elsewhere.
// NOTE(review): going by the name, the result is the UTF-8 encoding of
// `cp`. Behavior for values that are not valid code points (e.g. out of
// range) is not visible from this header - confirm in the implementation.
std::string unicode_cpt_to_utf8(uint32_t cp);
|
|
|
|
// Takes a string by const reference and returns a vector of 32-bit
// unsigned values by value. Declaration only; defined elsewhere.
// NOTE(review): going by the name, the result is the sequence of code
// points decoded from `utf8`, in input order. Handling of malformed or
// truncated UTF-8 is not visible from this header - confirm in the
// implementation.
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
|
2023-10-03 09:16:26 +02:00
|
|
|
|
2024-03-11 16:47:47 +01:00
|
|
|
// Takes a vector of code points by const reference and returns a new
// vector by value; the input is not modified through this parameter.
// Declaration only; defined elsewhere.
// NOTE(review): the name refers to Unicode NFD (canonical decomposition)
// normalization. How complete the decomposition is (e.g. multi-code-point
// decompositions, canonical reordering) is not visible here - confirm in
// the implementation.
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
|
2023-10-03 09:16:26 +02:00
|
|
|
|
2024-03-11 16:47:47 +01:00
|
|
|
// Classifies a single code point and returns the result as an int.
// Declaration only; defined elsewhere.
// NOTE(review): presumably the returned value is one of the
// CODEPOINT_TYPE_* constants defined in this header - confirm in the
// implementation.
int unicode_cpt_type(uint32_t cp);
|
|
|
|
// Overload of unicode_cpt_type() that takes a string by const reference
// instead of a numeric code point. Declaration only; defined elsewhere.
// NOTE(review): presumably classifies the first code point encoded in
// `utf8`; behavior for an empty string or a string holding several code
// points is not visible from this header - confirm in the implementation.
int unicode_cpt_type(const std::string & utf8);
|
2023-10-03 09:16:26 +02:00
|
|
|
|
2024-05-09 15:30:44 +02:00
|
|
|
// Predicate on a single code point; returns bool. Declaration only;
// defined elsewhere.
// NOTE(review): going by the name, returns true when `cp` is a whitespace
// code point. Which code points count as whitespace is not visible from
// this header - confirm in the implementation.
bool unicode_cpt_is_whitespace(uint32_t cp);
|
|
|
|
|
2024-03-11 16:47:47 +01:00
|
|
|
// Maps a single 8-bit value to a std::string returned by value.
// Declaration only; defined elsewhere.
// NOTE(review): the exact mapping is defined by the implementation; it is
// not visible here whether this is a plain encoding of the byte value or a
// lookup in a dedicated byte-to-character table - confirm before relying
// on a specific output for a given byte.
std::string unicode_byte_to_utf8(uint8_t byte);
|
|
|
|
// Maps a string, taken by const reference, to a single 8-bit value.
// Declaration only; defined elsewhere.
// NOTE(review): presumably the inverse of unicode_byte_to_utf8(). What
// happens when `utf8` has no corresponding byte is not visible from this
// header - confirm in the implementation.
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
2023-10-03 09:16:26 +02:00
|
|
|
|
2024-03-26 22:46:21 +01:00
|
|
|
// Takes one code point and returns one code point, both as char32_t.
// Note that the other code point APIs declared in this header use
// uint32_t, so callers may need an explicit conversion.
// Declaration only; defined elsewhere.
// NOTE(review): going by the name, returns the lowercase counterpart of
// `cp`; presumably `cp` is returned unchanged when no lowercase mapping
// exists - confirm in the implementation.
char32_t unicode_tolower(char32_t cp);
|
2024-04-29 15:58:41 +02:00
|
|
|
|
|
|
|
// Takes an input string and a list of regular-expression strings, both by
// const reference, and returns a vector of strings by value.
// Declaration only; defined elsewhere.
// NOTE(review): going by the name, the result holds the pieces of `text`
// produced by splitting it with `regex_exprs`. The regex dialect, how
// multiple expressions are combined, and whether the pieces concatenate
// back to `text` are not visible from this header - confirm in the
// implementation.
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
|