From 55f23826634d5d8351087bd9900cfadf80f41d73 Mon Sep 17 00:00:00 2001 From: AWE Henry Date: Sat, 25 Nov 2023 11:18:27 +0800 Subject: [PATCH] update codepoint --- include/papilio/utf/codepoint.hpp | 56 +++++++--- include/papilio/utf/codepoint.inl | 71 +++++++++++- src/utf/codepoint.cpp | 83 +------------- test/test_utf_codepoint.cpp | 173 +++++++++++++++++++++++++++--- 4 files changed, 272 insertions(+), 111 deletions(-) diff --git a/include/papilio/utf/codepoint.hpp b/include/papilio/utf/codepoint.hpp index b1ab66c..f3762cd 100644 --- a/include/papilio/utf/codepoint.hpp +++ b/include/papilio/utf/codepoint.hpp @@ -11,6 +11,7 @@ #ifdef PAPILIO_COMPILER_MSVC # pragma warning(push) +// Variable 'variable' is uninitialized. Always initialize a member variable (type.6). # pragma warning(disable:26495) #endif @@ -39,13 +40,11 @@ namespace papilio::utf class decoder { public: - static auto to_char32_t(std::u16string_view ch) -> std::pair; - static auto to_char32_t(char16_t first, char16_t second = u'\0') -> std::pair; + static constexpr auto to_char32_t(std::u16string_view ch) -> std::pair; + static constexpr auto to_char32_t(char16_t first, char16_t second = u'\0') -> std::pair; - static std::uint8_t size_bytes(char16_t ch) noexcept; - - static auto to_codepoint(std::u16string_view ch) -> std::pair; - static auto to_codepoint(char16_t first, char16_t second = u'\0') -> std::pair; + static constexpr auto to_codepoint(std::u16string_view ch) -> std::pair; + static constexpr auto to_codepoint(char16_t first, char16_t second = u'\0') -> std::pair; struct from_codepoint_result { @@ -59,17 +58,17 @@ namespace papilio::utf std::uint8_t processed_size = 0; [[nodiscard]] - std::u16string_view get() const noexcept + constexpr std::u16string_view get() const noexcept { return std::u16string_view(chars, size); } - operator std::u16string_view() const noexcept + constexpr operator std::u16string_view() const noexcept { return get(); } }; - static auto from_codepoint(codepoint cp) -> 
from_codepoint_result; + static constexpr auto from_codepoint(codepoint cp) -> from_codepoint_result; }; template <> class decoder @@ -220,7 +219,7 @@ namespace papilio::utf return std::bit_cast(u8data()); } [[nodiscard]] - constexpr std::uint8_t size() const noexcept + constexpr std::uint8_t size_bytes() const noexcept { return byte_count(m_data[0]); } @@ -231,11 +230,11 @@ namespace papilio::utf } explicit constexpr operator std::u8string_view() const noexcept { - return std::u8string_view(m_data, size()); + return std::u8string_view(m_data, size_bytes()); } explicit constexpr operator std::string_view() const noexcept { - return std::string_view(data(), size()); + return std::string_view(data(), size_bytes()); } template @@ -249,7 +248,7 @@ namespace papilio::utf static_cast(m_data[3]) }; - return std::make_pair(arr, size()); + return std::make_pair(arr, size_bytes()); } constexpr bool operator==(codepoint rhs) const noexcept @@ -289,6 +288,37 @@ namespace papilio::utf friend std::basic_ostream& operator<<(std::basic_ostream& os, codepoint cp); friend std::basic_ostream& operator<<(std::basic_ostream& os, codepoint cp); + constexpr std::size_t estimate_width() const + { + // [begin, end) intervals + constexpr std::pair estimate_intervals[] = + { + { 0x1100u, 0x1160u }, + { 0x2329u, 0x232Bu }, + { 0x2E80u, 0x303Fu }, + { 0x3040u, 0xA4D0u }, + { 0xAC00u, 0xD7A4u }, + { 0xF900u, 0xFB00u }, + { 0xFE10u, 0xFE1Au }, + { 0xFE30u, 0xFE70u }, + { 0xFF00u, 0xFF61u }, + { 0xFFE0u, 0xFFE7u }, + { 0x1F300u, 0x1F650u }, + { 0x1F900u, 0x1FA00u }, + { 0x20000u, 0x2FFFEu }, + { 0x30000u, 0x3FFFEu } + }; + + char32_t ch = static_cast(*this); + for(const auto& i : estimate_intervals) + { + if(i.first <= ch && ch < i.second) + return 2; + } + + return 1; + } + private: char8_t m_data[4]; }; diff --git a/include/papilio/utf/codepoint.inl b/include/papilio/utf/codepoint.inl index c81a8f6..046a523 100644 --- a/include/papilio/utf/codepoint.inl +++ b/include/papilio/utf/codepoint.inl 
@@ -17,7 +17,7 @@ namespace papilio::utf return 4; } - constexpr auto decoder::to_codepoint(char32_t ch)->std::pair + constexpr auto decoder::to_codepoint(char32_t ch) -> std::pair { std::uint8_t len = size_bytes(ch); char8_t bytes[4] = { 0, 0, 0, 0 }; @@ -49,7 +49,7 @@ namespace papilio::utf PAPILIO_UNREACHABLE(); } - return std::make_pair(codepoint(bytes, len), len); + return std::make_pair(codepoint(bytes, len), 1); } constexpr auto decoder::from_codepoint(codepoint cp) noexcept -> std::pair @@ -113,4 +113,71 @@ namespace papilio::utf std::uint8_t len = size_bytes(ch[0]); return std::make_pair(codepoint(ch.data(), len), len); } + + constexpr auto decoder::to_char32_t(std::u16string_view ch) -> std::pair + { + if(ch.empty()) [[unlikely]] + return std::make_pair(U'\0', 0); + + if(!is_high_surrogate(ch[0])) [[likely]] + { + char32_t result = ch[0]; + return std::make_pair(result, 1); + } + else + { + if(ch.size() < 2) [[unlikely]] + throw invalid_surrogate(ch[0]); + else if(!is_low_surrogate(ch[1])) [[unlikely]] + throw invalid_surrogate(ch[1]); + + char32_t result = + (ch[0] - 0xD800 << 10) + + (ch[1] - 0xDC00) + + 0x10000; + return std::make_pair(result, 2); + } + } + constexpr auto decoder::to_char32_t(char16_t first, char16_t second) -> std::pair + { + char16_t tmp[2] = { first, second }; + + return to_char32_t(std::u16string_view(tmp, 2)); + } + + constexpr auto decoder::to_codepoint(std::u16string_view ch) -> std::pair + { + auto [ch32, processed_size] = to_char32_t(ch); + return std::make_pair(decoder::to_codepoint(ch32).first, processed_size); + } + constexpr auto decoder::to_codepoint(char16_t first, char16_t second) -> std::pair + { + return decoder::to_codepoint(to_char32_t(first, second).first); + } + + constexpr auto decoder::from_codepoint(codepoint cp) -> from_codepoint_result + { + from_codepoint_result result; + + char32_t ch32; + std::tie(ch32, result.processed_size) = decoder::from_codepoint(cp); + if(ch32 <= 0xD7FF || (0xE000 <= ch32 && ch32 <= 
0xFFFF)) + { + result.chars[0] = static_cast(ch32); + result.size = 1; + } + else if(0x10000 <= ch32 && ch32 <= 0x10FFFF) + { + std::uint32_t tmp = ch32 - 0x10000; + result.chars[0] = 0xD800 + (tmp >> 10); + result.chars[1] = 0xDC00 + (tmp & 0x3FF); + result.size = 2; + } + else + { + throw std::invalid_argument("invalid codepoint"); // TODO: Better exception + } + + return result; + } } diff --git a/src/utf/codepoint.cpp b/src/utf/codepoint.cpp index 10c4d7a..1e39c37 100644 --- a/src/utf/codepoint.cpp +++ b/src/utf/codepoint.cpp @@ -4,85 +4,7 @@ namespace papilio::utf { - auto decoder::to_char32_t(std::u16string_view ch) -> std::pair - { - if(ch.empty()) [[unlikely]] - return std::make_pair(U'\0', 0); - - if(!is_high_surrogate(ch[0])) [[likely]] - { - char32_t result = ch[0]; - return std::make_pair(result, 1); - } - else - { - if(ch.size() < 2) [[unlikely]] - throw invalid_surrogate(ch[0]); - else if(!is_low_surrogate(ch[1])) [[unlikely]] - throw invalid_surrogate(ch[1]); - - char32_t result = - (ch[0] - 0xD800 << 10) + - (ch[1] - 0xDC00) + - 0x10000; - return std::make_pair(result, 2); - } - } - auto decoder::to_char32_t(char16_t first, char16_t second) -> std::pair - { - char16_t tmp[2] = { first, second }; - - return to_char32_t(std::u16string_view(tmp, 2)); - } - - std::uint8_t decoder::size_bytes(char16_t ch) noexcept - { - if(!is_high_surrogate(ch)) - { - return 1; - } - else - { - return 2; - } - } - - auto decoder::to_codepoint(std::u16string_view ch) -> std::pair - { - return decoder::to_codepoint(to_char32_t(ch).first); - } - auto decoder::to_codepoint(char16_t first, char16_t second) -> std::pair - { - return decoder::to_codepoint(to_char32_t(first, second).first); - } - - auto decoder::from_codepoint(codepoint cp) -> from_codepoint_result - { - from_codepoint_result result; - - char32_t ch32; - std::tie(ch32, result.processed_size) = decoder::from_codepoint(cp); - if(ch32 <= 0xD7FF || (0xE000 <= ch32 && ch32 <=0xFFFF)) - { - result.chars[0] = 
static_cast(ch32); - result.size = 1; - } - else if(0x10000 <= ch32 && ch32 <= 0x10FFFF) - { - std::uint32_t tmp = ch32 - 0x10000; - result.chars[0] = 0xD800 + (tmp >> 10); - result.chars[1] = 0xDC00 + (tmp & 0x3FF); - result.size = 2; - } - else - { - throw std::invalid_argument("invalid codepoint"); // TODO: Better exception - } - - return result; - } - - auto decoder::to_char32_t(std::wstring_view ch)->std::pair + auto decoder::to_char32_t(std::wstring_view ch) -> std::pair { if(ch.empty()) [[unlikely]] return std::make_pair(U'\0', 0); @@ -100,7 +22,8 @@ namespace papilio::utf auto decoder::to_codepoint(std::wstring_view ch) -> std::pair { - return decoder::to_codepoint(to_char32_t(ch).first); + auto [ch32, processed_size] = to_char32_t(ch); + return std::make_pair(decoder::to_codepoint(ch32).first, processed_size); } auto decoder::from_codepoint(codepoint cp) -> from_codepoint_result diff --git a/test/test_utf_codepoint.cpp b/test/test_utf_codepoint.cpp index 02600e1..7bf925f 100644 --- a/test/test_utf_codepoint.cpp +++ b/test/test_utf_codepoint.cpp @@ -3,21 +3,102 @@ #include -TEST(TestUTFCodepoint, Codepoint) +// codepoint should be POD +static_assert(std::is_trivial_v); +static_assert(std::is_standard_layout_v); + +TEST(decoder, char8_t) +{ + using namespace papilio; + using namespace utf; + + // Test data + + // "Ä" + constexpr char8_t capital_a_with_diaeresis[] = u8"\u00c4"; + // CJK Unified Ideographs 4E00 + // "一" + constexpr char8_t cjk_4e00[] = u8"\u4e00"; + // Peach Emoji + // "🍑" + constexpr char8_t peach_emoji[] = u8"\U0001f351"; + + { + static_assert(decoder::size_bytes(u8"A"[0]) == 1); + static_assert(decoder::size_bytes(capital_a_with_diaeresis[0]) == 2); + static_assert(decoder::size_bytes(cjk_4e00[0]) == 3); + static_assert(decoder::size_bytes(peach_emoji[0]) == 4); + } + + { + codepoint cp; + std::uint8_t size; + + std::tie(cp, size) = decoder::to_codepoint(u8"A"); + EXPECT_EQ(cp, U'A'); + EXPECT_EQ(size, cp.size_bytes()); + } +} + 
+TEST(decoder, char16_t) { using namespace papilio; using namespace utf; + // Test data + + // "Ä" + constexpr char16_t capital_a_with_diaeresis = u'\u00c4'; + // CJK Unified Ideographs 4E00 + // "一" + constexpr char16_t cjk_4e00 = u'\u4e00'; + // Peach Emoji + // "🍑" + constexpr char16_t peach_emoji[] = u"\U0001f351"; + { - static_assert(std::is_trivial_v); - static_assert(std::is_standard_layout_v); + codepoint cp; + std::uint8_t processed_size; + + std::tie(cp, processed_size) = decoder::to_codepoint(u"A"); + EXPECT_EQ(cp, U'A'); + EXPECT_EQ(processed_size, 1); + + std::tie(cp, processed_size) = decoder::to_codepoint(capital_a_with_diaeresis); + EXPECT_EQ(cp, U'\u00c4'); + EXPECT_EQ(processed_size, 1); + + std::tie(cp, processed_size) = decoder::to_codepoint(cjk_4e00); + EXPECT_EQ(cp, U'\u4e00'); + EXPECT_EQ(processed_size, 1); + + std::tie(cp, processed_size) = decoder::to_codepoint(peach_emoji); + EXPECT_EQ(cp, U'\U0001f351'); + EXPECT_EQ(processed_size, 2); } +} + +TEST(decoder, char32_t) +{ + using namespace papilio; + using namespace utf; + + // Test data + + // "Ä" + constexpr char32_t capital_a_with_diaeresis = U'\u00c4'; + // CJK Unified Ideographs 4E00 + // "一" + constexpr char32_t cjk_4e00 = U'\u4e00'; + // Peach Emoji + // "🍑" + constexpr char32_t peach_emoji = U'\U0001f351'; { static_assert(decoder::size_bytes(U'A') == 1); - static_assert(decoder::size_bytes(U'Ä') == 2); - static_assert(decoder::size_bytes(U'我') == 3); - static_assert(decoder::size_bytes(U'🔊') == 4); + static_assert(decoder::size_bytes(capital_a_with_diaeresis) == 2); + static_assert(decoder::size_bytes(cjk_4e00) == 3); + static_assert(decoder::size_bytes(peach_emoji) == 4); } { @@ -25,19 +106,79 @@ TEST(TestUTFCodepoint, Codepoint) cp = decoder::to_codepoint(U'A').first; EXPECT_EQ(cp, U'A'); - EXPECT_EQ(cp.size(), 1); + EXPECT_EQ(cp.size_bytes(), 1); + + cp = decoder::to_codepoint(capital_a_with_diaeresis).first; + EXPECT_EQ(cp, capital_a_with_diaeresis); + 
EXPECT_EQ(cp.size_bytes(), 2); + + cp = decoder::to_codepoint(cjk_4e00).first; + EXPECT_EQ(cp, cjk_4e00); + EXPECT_EQ(cp.size_bytes(), 3); + + cp = decoder::to_codepoint(peach_emoji).first; + EXPECT_EQ(cp, peach_emoji); + EXPECT_EQ(cp.size_bytes(), 4); + } +} + +TEST(decoder, wchar_t) +{ + using namespace papilio; + using namespace utf; + + // Test data + + // "Ä" + constexpr wchar_t capital_a_with_diaeresis[] = L"\u00c4"; + // CJK Unified Ideographs 4E00 + // "一" + constexpr wchar_t cjk_4e00[] = L"\u4e00"; + // Peach Emoji + // "🍑" + constexpr wchar_t peach_emoji[] = L"\U0001f351"; + + { + codepoint cp; + std::uint8_t processed_size; + + std::tie(cp, processed_size) = decoder::to_codepoint(L"A"); + EXPECT_EQ(cp, U'A'); + EXPECT_EQ(processed_size, 1); + EXPECT_EQ(cp.size_bytes(), 1); + + std::tie(cp, processed_size) = decoder::to_codepoint(capital_a_with_diaeresis); + EXPECT_EQ(cp, U'\u00c4'); + EXPECT_EQ(processed_size, 1); + EXPECT_EQ(cp.size_bytes(), 2); + + std::tie(cp, processed_size) = decoder::to_codepoint(cjk_4e00); + EXPECT_EQ(cp, U'\u4e00'); + EXPECT_EQ(processed_size, 1); + EXPECT_EQ(cp.size_bytes(), 3); + + std::tie(cp, processed_size) = decoder::to_codepoint(peach_emoji); + EXPECT_EQ(cp, U'\U0001f351'); + EXPECT_EQ(processed_size, (sizeof(wchar_t) == sizeof(char32_t) ? 1 : 2)); + EXPECT_EQ(cp.size_bytes(), 4); + } +} - cp = decoder::to_codepoint(U'Ä').first; - EXPECT_EQ(cp, U'Ä'); - EXPECT_EQ(cp.size(), 2); +TEST(codepoint, estimate_width) +{ + using namespace papilio; + using namespace utf; - cp = decoder::to_codepoint(U'我').first; - EXPECT_EQ(cp, U'我'); - EXPECT_EQ(cp.size(), 3); + { + codepoint a = U'a'_cp; + EXPECT_EQ(a.estimate_width(), 1); + } - cp = decoder::to_codepoint(U'🔊').first; - EXPECT_EQ(cp, U'🔊'); - EXPECT_EQ(cp.size(), 4); + { + // CJK Unified Ideographs 6587 + // "文" + codepoint cjk_6587 = U'\u6587'_cp; + EXPECT_EQ(cjk_6587.estimate_width(), 2); } }